This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code. Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I. When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file). The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
The bibliographic references used for this practice have been: (Baayen 2008; Hothorn and Everitt 2014; Hyndman and Athanasopoulos 2021; Liviano Solas and Pujol Jover, n.d.; Teetor 2011; Vegas Lozano, n.d.).
# At section - Data types and modifications
if(!require(knitr)){
install.packages('knitr', repos='http://cran.us.r-project.org')
library(knitr)}
## Loading required package: knitr
if(!require(latexpdf)){
install.packages('latexpdf', repos='http://cran.us.r-project.org')
library(latexpdf)}
## Loading required package: latexpdf
if(!require(latex2exp)){
install.packages('latex2exp', repos='http://cran.us.r-project.org')
library(latex2exp)}
## Loading required package: latex2exp
if(!require(data.table)){
install.packages('data.table', repos='http://cran.us.r-project.org')
library(data.table)}
## Loading required package: data.table
if(!require(tidyverse)){
install.packages("tidyverse", repos='http://cran.us.r-project.org')
library(tidyverse)}
## Loading required package: tidyverse
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.3
## v tibble 3.0.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
if(!require(VIM)){
install.packages('VIM', repos='http://cran.us.r-project.org')
library(VIM)}
## Loading required package: VIM
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
if(!require(imputeTS)){
install.packages("imputeTS", repos='http://cran.us.r-project.org')
library(imputeTS)}
## Loading required package: imputeTS
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
if(!require(xts)){
install.packages("xts", repos='http://cran.us.r-project.org')
library(xts)}
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
##
## na.locf
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following objects are masked from 'package:data.table':
##
## first, last
if(!require(tsbox)){
install.packages("tsbox", repos='http://cran.us.r-project.org')
library(tsbox)}
## Loading required package: tsbox
# At section - Visual analysis
if(!require(fpp3)){
install.packages("fpp3", repos='http://cran.us.r-project.org')
library(fpp3)}
## Loading required package: fpp3
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- fpp3 0.4.0 --
## v lubridate 1.7.8 v feasts 0.2.1
## v tsibble 1.0.0 v fable 0.3.0
## v tsibbledata 0.3.0
## -- Conflicts -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- fpp3_conflicts --
## x dplyr::between() masks data.table::between()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x xts::first() masks dplyr::first(), data.table::first()
## x lubridate::hour() masks data.table::hour()
## x tsibble::index() masks zoo::index()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval() masks lubridate::interval()
## x lubridate::isoweek() masks data.table::isoweek()
## x tsibble::key() masks data.table::key()
## x dplyr::lag() masks stats::lag()
## x xts::last() masks dplyr::last(), data.table::last()
## x lubridate::mday() masks data.table::mday()
## x lubridate::minute() masks data.table::minute()
## x lubridate::month() masks data.table::month()
## x lubridate::quarter() masks data.table::quarter()
## x lubridate::second() masks data.table::second()
## x tsibble::setdiff() masks base::setdiff()
## x purrr::transpose() masks data.table::transpose()
## x tsibble::union() masks base::union()
## x lubridate::wday() masks data.table::wday()
## x lubridate::week() masks data.table::week()
## x lubridate::yday() masks data.table::yday()
## x lubridate::year() masks data.table::year()
if(!require(corrplot)){
install.packages('corrplot', repos='http://cran.us.r-project.org')
library(corrplot)}
## Loading required package: corrplot
## corrplot 0.84 loaded
if(!require(DescTools)){
install.packages("DescTools", repos='http://cran.us.r-project.org')
library(DescTools)}
## Loading required package: DescTools
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:fabletools':
##
## MAE, MAPE, MSE, RMSE
## The following object is masked from 'package:data.table':
##
## %like%
# At sections - ARIMA
#if(!require(forecast)){
# install.packages('forecast', repos='http://cran.us.r-project.org')
# library(forecast)}
#if(!require(tseries)){
# install.packages('tseries', repos='http://cran.us.r-project.org')
# library(tseries)}
#if(!require(astsa)){
# install.packages('astsa', repos='http://cran.us.r-project.org')
# library(astsa)}
#if(!require(psych)){
# install.packages("psych", repos='http://cran.us.r-project.org')
# library(psych)}
#if(!require(stats)){
# install.packages("stats", repos='http://cran.us.r-project.org')
# library(stats)}
#if(!require(keras)){
# install.packages('keras', repos='http://cran.us.r-project.org')
# library(keras)}
#if(!require(tensorflow)){
# install.packages('tensorflow', repos='http://cran.us.r-project.org')
# library(tensorflow)}
#if(!require(DataExplorer)){
# install.packages('DataExplorer', repos='http://cran.us.r-project.org')
# library(DataExplorer)}
knitr::opts_chunk$set(echo = TRUE)
Data is loaded from the sources stated at PEC1 and PEC2 (CNE, INE and Google).
#library(dplyr)
# Source INE
EM3 <- read.csv('EM3-Movimiento de personas por provincias.csv',
header=TRUE,
sep = ";",
stringsAsFactors = FALSE)
# Source Google
Google <- read.csv('Google-2020_ES_Region_Mobility_Report.csv',
header=TRUE,
sep = ";",
stringsAsFactors = FALSE)
# Source CNE (here sep is ",")
CNE_tecnica <- read.csv('CNE-casos_tecnica_provincia.csv',
header=TRUE,
sep = ",",
stringsAsFactors = FALSE)
CNE_casos <- read.csv('CNE-casos_hosp_uci_def_sexo_edad_provres.csv',
header=TRUE,
sep = ",",
stringsAsFactors = FALSE)
We are gonig to check the type of variable that corresponds to each of the variables (numerical, factor, etc.) and missing data / values or other anomalies in each dataset.
Here we have the mobility of people by provinces (we can see 146 rows by province, that correspond to days). In order to facilitate the comparison, a valid reference date for the mobility of the population should be considered. The “normal” date for this study, has been considered as the one that results from the average of the days 18 (Monday) to 21 (Thursday) of November 2019. It is indicated in the tables as the reference date 18/11/2019.
# Source INE
summary(EM3)
## Zonas.de.movilidad Periodo Total
## Length:9198 Length:9198 Length:9198
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
head(str(EM3,vec.len=2))
## 'data.frame': 9198 obs. of 3 variables:
## $ Zonas.de.movilidad: chr "Almería" "Almería" ...
## $ Periodo : chr "30/12/2020" "27/12/2020" ...
## $ Total : chr "17,17" "11,53" ...
## NULL
table(EM3$Zonas.de.movilidad)
##
## Albacete Alicante/Alacant Almería
## 146 146 146
## Araba/Álava Asturias Ávila
## 146 146 146
## Badajoz Balears, Illes Barcelona
## 146 146 146
## Bizkaia Burgos Cáceres
## 146 146 146
## Cádiz Cantabria Castellón/Castelló
## 146 146 146
## Ceuta Ciudad Real Córdoba
## 146 146 146
## Coruña, A Cuenca Formentera
## 146 146 146
## Fuerteventura Gipuzkoa Girona
## 146 146 146
## Gomera, La Gran Canaria Granada
## 146 146 146
## Guadalajara Hierro, El Huelva
## 146 146 146
## Huesca Ibiza Jaén
## 146 146 146
## Lanzarote León Lleida
## 146 146 146
## Lugo Madrid Málaga
## 146 146 146
## Mallorca Melilla Menorca
## 146 146 146
## Murcia Navarra Ourense
## 146 146 146
## Palencia Palma, La Palmas, Las
## 146 146 146
## Pontevedra Rioja, La Salamanca
## 146 146 146
## Santa Cruz de Tenerife Segovia Sevilla
## 146 146 146
## Soria Tarragona Tenerife
## 146 146 146
## Teruel Toledo Valencia/València
## 146 146 146
## Valladolid Zamora Zaragoza
## 146 146 146
We are going to transform:
EM3$Total <- sub(",", ".", EM3$Total)
EM3$Total <- as.numeric(EM3$Total)
EM3$Periodo <- as.Date(EM3$Periodo,format="%d/%m/%Y")
head(EM3)
## Zonas.de.movilidad Periodo Total
## 1 Almería 2020-12-30 17.17
## 2 Almería 2020-12-27 11.53
## 3 Almería 2020-12-23 17.81
## 4 Almería 2020-12-20 12.13
## 5 Almería 2020-12-16 18.28
## 6 Almería 2020-12-13 11.97
Due to the nature of this dataset we have to transpose it in order to analyse the missing values by province and impute them. There are some dates that are not provided by EM3 study.
#library(data.table)
# Transpose dataframe
EM3_t<-dcast(EM3, Periodo~Zonas.de.movilidad)#, fill=NA)
#library(tidyverse)
# Create dates missing (for time series).
# Note: Acccording INE, some "dates" are not provided.
# We have to generate them
EM3_t<-EM3_t %>%
complete(Periodo = seq.Date(min(Periodo), max(Periodo), by="day"))
# Filter the interest period according INE EM3 study
# "2019-11-18" is the reference date EM3 study (for us it is excluded)
EM3_t<- EM3_t %>%
filter(Periodo <= "2019-11-18" | Periodo >= "2020-03-16")
EM3_t
## # A tibble: 291 x 64
## Periodo Albacete `Alicante/Alaca~ Almería `Araba/Álava` Asturias Ávila
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-11-18 25.2 28.1 24.4 31.9 29.9 26.6
## 2 2020-03-16 9.9 14.4 11.0 15.9 13.1 9.44
## 3 2020-03-17 NA NA NA NA NA NA
## 4 2020-03-18 9.51 13.4 7.28 14.5 12.0 9.17
## 5 2020-03-19 NA NA NA NA NA NA
## 6 2020-03-20 8.75 12.0 6.87 11.9 11.3 8.69
## 7 2020-03-21 NA NA NA NA NA NA
## 8 2020-03-22 4.5 6.14 4.19 6.46 5.64 4.53
## 9 2020-03-23 NA NA NA NA NA NA
## 10 2020-03-24 9.02 10.9 8.98 13.3 11.2 8.26
## # ... with 281 more rows, and 57 more variables: Badajoz <dbl>, `Balears,
## # Illes` <dbl>, Barcelona <dbl>, Bizkaia <dbl>, Burgos <dbl>, Cáceres <dbl>,
## # Cádiz <dbl>, Cantabria <dbl>, `Castellón/Castelló` <dbl>, Ceuta <dbl>,
## # `Ciudad Real` <dbl>, Córdoba <dbl>, `Coruña, A` <dbl>, Cuenca <dbl>,
## # Formentera <dbl>, Fuerteventura <dbl>, Gipuzkoa <dbl>, Girona <dbl>,
## # `Gomera, La` <dbl>, `Gran Canaria` <dbl>, Granada <dbl>, Guadalajara <dbl>,
## # `Hierro, El` <dbl>, Huelva <dbl>, Huesca <dbl>, Ibiza <dbl>, Jaén <dbl>,
## # Lanzarote <dbl>, León <dbl>, Lleida <dbl>, Lugo <dbl>, Madrid <dbl>,
## # Málaga <dbl>, Mallorca <dbl>, Melilla <dbl>, Menorca <dbl>, Murcia <dbl>,
## # Navarra <dbl>, Ourense <dbl>, Palencia <dbl>, `Palma, La` <dbl>, `Palmas,
## # Las` <dbl>, Pontevedra <dbl>, `Rioja, La` <dbl>, Salamanca <dbl>, `Santa
## # Cruz de Tenerife` <dbl>, Segovia <dbl>, Sevilla <dbl>, Soria <dbl>,
## # Tarragona <dbl>, Tenerife <dbl>, Teruel <dbl>, Toledo <dbl>,
## # `Valencia/València` <dbl>, Valladolid <dbl>, Zamora <dbl>, Zaragoza <dbl>
We check the missing values by province (we are close to 150 by province).
#library(VIM)
aggr(EM3_t[,-1], col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(EM3_t[,-1]), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Albacete 0.4982818
## Alicante/Alacant 0.4982818
## Almería 0.4982818
## Araba/Álava 0.4982818
## Asturias 0.4982818
## Ávila 0.4982818
## Badajoz 0.4982818
## Balears, Illes 0.4982818
## Barcelona 0.4982818
## Bizkaia 0.4982818
## Burgos 0.4982818
## Cáceres 0.4982818
## Cádiz 0.4982818
## Cantabria 0.4982818
## Castellón/Castelló 0.4982818
## Ceuta 0.4982818
## Ciudad Real 0.4982818
## Córdoba 0.4982818
## Coruña, A 0.4982818
## Cuenca 0.4982818
## Formentera 0.4982818
## Fuerteventura 0.4982818
## Gipuzkoa 0.4982818
## Girona 0.4982818
## Gomera, La 0.4982818
## Gran Canaria 0.4982818
## Granada 0.4982818
## Guadalajara 0.4982818
## Hierro, El 0.4982818
## Huelva 0.4982818
## Huesca 0.4982818
## Ibiza 0.4982818
## Jaén 0.4982818
## Lanzarote 0.4982818
## León 0.4982818
## Lleida 0.4982818
## Lugo 0.4982818
## Madrid 0.4982818
## Málaga 0.4982818
## Mallorca 0.4982818
## Melilla 0.4982818
## Menorca 0.4982818
## Murcia 0.4982818
## Navarra 0.4982818
## Ourense 0.4982818
## Palencia 0.4982818
## Palma, La 0.4982818
## Palmas, Las 0.4982818
## Pontevedra 0.4982818
## Rioja, La 0.4982818
## Salamanca 0.4982818
## Santa Cruz de Tenerife 0.4982818
## Segovia 0.4982818
## Sevilla 0.4982818
## Soria 0.4982818
## Tarragona 0.4982818
## Tenerife 0.4982818
## Teruel 0.4982818
## Toledo 0.4982818
## Valencia/València 0.4982818
## Valladolid 0.4982818
## Zamora 0.4982818
## Zaragoza 0.4982818
EM3_t %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
We impute the missing values following the principales stated for imputeTS. Thanks to this approach we almost double the amount of data for analysis by province (It was selected “na_seadec” function due to it covers seasonality aspects -weekdays/weekends in our case-).
It is needed to transform the dataframe to a time series object.
# Used to convert dataframe to ts object
#library(xts)
EM3_t_ts<-xts(EM3_t[-1],EM3_t$Periodo)
# Impute the missing values with na_kalman, na_seadec, na_interpolation & na_seasplit
#library(imputeTS)
imp <- na_kalman(EM3_t_ts[,1])
ggplot_na_imputations(EM3_t_ts[,1], imp)
imp2 <- na_seadec(EM3_t_ts[,1])
ggplot_na_imputations(EM3_t_ts[,1], imp2)
imp3 <- na_seasplit(EM3_t_ts[,1])
ggplot_na_imputations(EM3_t_ts[,1], imp3)
imp4 <- na_interpolation(EM3_t_ts[,1])
ggplot_na_imputations(EM3_t_ts[,1], imp4)
# We select na_seadec for the dataset
EM3_t_ts <- na_seadec(EM3_t_ts)
plot(EM3_t_ts[,1])
# We convert the time series object to a dataframe
#library(tsbox)
EM3 <- ts_df(EM3_t_ts)
names(EM3)[names(EM3) == "id"] <- "Zonas.de.movilidad"
names(EM3)[names(EM3) == "time"] <- "Periodo"
names(EM3)[names(EM3) == "value"] <- "Total"
# Transpose dataframe
EM3_t<-dcast(EM3, Periodo~Zonas.de.movilidad, fill=NA)
# We check again missing values (result should be zero)
aggr(EM3_t[,-1], col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(EM3_t[,-1]), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Albacete 0
## Alicante/Alacant 0
## Almería 0
## Araba/Álava 0
## Asturias 0
## Ávila 0
## Badajoz 0
## Balears, Illes 0
## Barcelona 0
## Bizkaia 0
## Burgos 0
## Cáceres 0
## Cádiz 0
## Cantabria 0
## Castellón/Castelló 0
## Ceuta 0
## Ciudad Real 0
## Córdoba 0
## Coruña, A 0
## Cuenca 0
## Formentera 0
## Fuerteventura 0
## Gipuzkoa 0
## Girona 0
## Gomera, La 0
## Gran Canaria 0
## Granada 0
## Guadalajara 0
## Hierro, El 0
## Huelva 0
## Huesca 0
## Ibiza 0
## Jaén 0
## Lanzarote 0
## León 0
## Lleida 0
## Lugo 0
## Madrid 0
## Málaga 0
## Mallorca 0
## Melilla 0
## Menorca 0
## Murcia 0
## Navarra 0
## Ourense 0
## Palencia 0
## Palma, La 0
## Palmas, Las 0
## Pontevedra 0
## Rioja, La 0
## Salamanca 0
## Santa Cruz de Tenerife 0
## Segovia 0
## Sevilla 0
## Soria 0
## Tarragona 0
## Tenerife 0
## Teruel 0
## Toledo 0
## Valencia/València 0
## Valladolid 0
## Zamora 0
## Zaragoza 0
EM3_t %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
head(str(EM3_t,vec.len=2))
## 'data.frame': 291 obs. of 64 variables:
## $ Periodo : Date, format: "2019-11-18" "2020-03-16" ...
## $ Albacete : num 25.2 9.9 ...
## $ Alicante/Alacant : num 28.1 14.4 ...
## $ Almería : num 24.4 11 ...
## $ Araba/Álava : num 31.9 15.9 ...
## $ Asturias : num 29.9 13.1 ...
## $ Ávila : num 26.62 9.44 ...
## $ Badajoz : num 19.5 10.7 ...
## $ Balears, Illes : num 25.9 12.3 ...
## $ Barcelona : num 34.9 14.6 ...
## $ Bizkaia : num 33.9 17.9 ...
## $ Burgos : num 24.9 11.3 ...
## $ Cáceres : num 21.37 9.08 ...
## $ Cádiz : num 24.5 10.7 ...
## $ Cantabria : num 31.5 14.5 ...
## $ Castellón/Castelló : num 26.9 11.7 ...
## $ Ceuta : num 31.9 12.3 ...
## $ Ciudad Real : num 17.07 8.29 ...
## $ Córdoba : num 22.4 11.9 ...
## $ Coruña, A : num 26.8 15.8 ...
## $ Cuenca : num 20.27 8.54 ...
## $ Formentera : num 12.98 3.08 ...
## $ Fuerteventura : num 11.9 5.76 5.31 4.86 4.62 ...
## $ Gipuzkoa : num 31.3 14.1 ...
## $ Girona : num 25.6 10.9 ...
## $ Gomera, La : num 11.43 4.26 ...
## $ Gran Canaria : num 31.4 16 ...
## $ Granada : num 24.2 11.4 ...
## $ Guadalajara : num 31.3 12.8 ...
## $ Hierro, El : num 14.82 5.75 ...
## $ Huelva : num 21.8 11.4 ...
## $ Huesca : num 25.5 10.3 ...
## $ Ibiza : num 20.3 10.1 ...
## $ Jaén : num 18.71 8.81 ...
## $ Lanzarote : num 25.1 13.6 ...
## $ León : num 29.8 13.2 ...
## $ Lleida : num 28.4 11.9 ...
## $ Lugo : num 23.6 12.4 ...
## $ Madrid : num 36.7 13.9 ...
## $ Málaga : num 23.7 11.4 ...
## $ Mallorca : num 28.1 13.3 ...
## $ Melilla : num 35 12.5 ...
## $ Menorca : num 16.26 7.21 ...
## $ Murcia : num 24.6 12.4 ...
## $ Navarra : num 30.6 13.7 ...
## $ Ourense : num 27.5 15.6 ...
## $ Palencia : num 31.1 12.9 ...
## $ Palma, La : num 24.6 13.1 ...
## $ Palmas, Las : num 28.5 14.6 ...
## $ Pontevedra : num 26.2 17.1 ...
## $ Rioja, La : num 28 12.1 ...
## $ Salamanca : num 28.1 12.9 ...
## $ Santa Cruz de Tenerife: num 28 13.7 ...
## $ Segovia : num 29 12.6 ...
## $ Sevilla : num 25.9 12.3 ...
## $ Soria : num 17.67 7.85 ...
## $ Tarragona : num 28.4 11.4 ...
## $ Tenerife : num 28.9 14.1 ...
## $ Teruel : num 16.35 6.77 ...
## $ Toledo : num 25.7 12.3 ...
## $ Valencia/València : num 31 15.3 ...
## $ Valladolid : num 29 14.3 ...
## $ Zamora : num 26 11.5 ...
## $ Zaragoza : num 32.3 14.8 ...
## NULL
summary(EM3_t)
## Periodo Albacete Alicante/Alacant Almería
## Min. :2019-11-18 Min. : 3.430 Min. : 5.72 Min. : 4.10
## 1st Qu.:2020-05-26 1st Qu.: 9.738 1st Qu.:14.42 1st Qu.:11.31
## Median :2020-08-07 Median :11.960 Median :17.19 Median :13.84
## Mean :2020-08-06 Mean :11.592 Mean :16.16 Mean :13.19
## 3rd Qu.:2020-10-18 3rd Qu.:13.873 3rd Qu.:18.70 3rd Qu.:15.56
## Max. :2020-12-30 Max. :25.250 Max. :28.12 Max. :24.35
## Araba/Álava Asturias Ávila Badajoz
## Min. : 6.30 Min. : 5.24 Min. : 4.51 Min. : 4.90
## 1st Qu.:14.48 1st Qu.:11.93 1st Qu.:10.10 1st Qu.:11.29
## Median :18.09 Median :16.28 Median :12.71 Median :13.10
## Mean :17.52 Mean :15.65 Mean :11.93 Mean :12.74
## 3rd Qu.:21.11 3rd Qu.:19.41 3rd Qu.:14.26 3rd Qu.:14.79
## Max. :31.92 Max. :29.94 Max. :26.62 Max. :19.46
## Balears, Illes Barcelona Bizkaia Burgos
## Min. : 3.92 Min. : 6.42 Min. : 7.53 Min. : 3.97
## 1st Qu.:13.09 1st Qu.:13.95 1st Qu.:15.41 1st Qu.:10.74
## Median :16.14 Median :17.51 Median :19.70 Median :13.06
## Mean :15.29 Mean :17.05 Mean :19.21 Mean :12.68
## 3rd Qu.:18.46 3rd Qu.:20.25 3rd Qu.:23.06 3rd Qu.:15.26
## Max. :25.95 Max. :34.90 Max. :33.93 Max. :24.88
## Cáceres Cádiz Cantabria Castellón/Castelló
## Min. : 4.02 Min. : 5.42 Min. : 5.84 Min. : 4.95
## 1st Qu.:10.34 1st Qu.:12.59 1st Qu.:13.40 1st Qu.:12.59
## Median :12.39 Median :15.98 Median :17.57 Median :15.00
## Mean :11.81 Mean :14.92 Mean :16.93 Mean :14.43
## 3rd Qu.:13.84 3rd Qu.:17.82 3rd Qu.:20.63 3rd Qu.:16.76
## Max. :21.37 Max. :24.46 Max. :31.50 Max. :26.93
## Ceuta Ciudad Real Córdoba Coruña, A
## Min. : 4.87 Min. : 3.080 Min. : 5.68 Min. : 6.02
## 1st Qu.:11.90 1st Qu.: 8.291 1st Qu.:11.49 1st Qu.:12.73
## Median :13.93 Median :10.050 Median :13.69 Median :15.84
## Mean :13.34 Mean : 9.597 Mean :13.41 Mean :15.63
## 3rd Qu.:15.34 3rd Qu.:11.453 3rd Qu.:15.67 3rd Qu.:18.96
## Max. :31.88 Max. :17.070 Max. :22.38 Max. :26.84
## Cuenca Formentera Fuerteventura Gipuzkoa
## Min. : 3.010 Min. : 0.830 Min. : 1.140 Min. : 4.71
## 1st Qu.: 9.298 1st Qu.: 2.520 1st Qu.: 5.745 1st Qu.:11.95
## Median :11.620 Median : 3.562 Median : 7.490 Median :15.49
## Mean :10.843 Mean : 4.559 Mean : 6.954 Mean :14.94
## 3rd Qu.:12.892 3rd Qu.: 7.320 3rd Qu.: 8.503 3rd Qu.:18.27
## Max. :20.270 Max. :12.980 Max. :11.900 Max. :31.31
## Girona Gomera, La Gran Canaria Granada
## Min. : 4.16 Min. : 1.340 Min. : 5.70 Min. : 5.71
## 1st Qu.:10.24 1st Qu.: 4.860 1st Qu.:15.07 1st Qu.:11.38
## Median :14.60 Median : 6.635 Median :18.21 Median :14.72
## Mean :13.45 Mean : 6.111 Mean :17.54 Mean :14.07
## 3rd Qu.:16.65 3rd Qu.: 7.416 3rd Qu.:20.93 3rd Qu.:16.66
## Max. :25.56 Max. :11.430 Max. :31.44 Max. :24.25
## Guadalajara Hierro, El Huelva Huesca
## Min. : 4.89 Min. : 1.560 Min. : 5.51 Min. : 4.24
## 1st Qu.:12.19 1st Qu.: 6.645 1st Qu.:10.94 1st Qu.:11.12
## Median :14.93 Median : 8.470 Median :13.65 Median :13.77
## Mean :14.52 Mean : 7.933 Mean :13.23 Mean :12.96
## 3rd Qu.:17.29 3rd Qu.: 9.710 3rd Qu.:15.62 3rd Qu.:15.14
## Max. :31.33 Max. :14.820 Max. :21.79 Max. :25.48
## Ibiza Jaén Lanzarote León
## Min. : 2.960 Min. : 4.17 Min. : 4.73 Min. : 5.56
## 1st Qu.: 9.411 1st Qu.: 9.43 1st Qu.:12.87 1st Qu.:12.58
## Median :11.550 Median :11.77 Median :14.89 Median :15.65
## Mean :12.017 Mean :11.24 Mean :14.22 Mean :14.92
## 3rd Qu.:15.325 3rd Qu.:13.22 3rd Qu.:16.58 3rd Qu.:17.50
## Max. :20.650 Max. :18.71 Max. :25.07 Max. :29.80
## Lleida Lugo Madrid Málaga
## Min. : 5.16 Min. : 4.88 Min. : 6.03 Min. : 4.56
## 1st Qu.:11.73 1st Qu.:10.94 1st Qu.:13.21 1st Qu.:12.25
## Median :14.89 Median :13.64 Median :16.58 Median :15.41
## Mean :14.25 Mean :13.11 Mean :16.19 Mean :14.51
## 3rd Qu.:16.81 3rd Qu.:15.65 3rd Qu.:19.51 3rd Qu.:17.51
## Max. :28.38 Max. :23.60 Max. :36.70 Max. :23.71
## Mallorca Melilla Menorca Murcia
## Min. : 4.36 Min. : 6.48 Min. : 1.590 Min. : 5.13
## 1st Qu.:14.31 1st Qu.:13.60 1st Qu.: 8.134 1st Qu.:11.81
## Median :17.52 Median :15.85 Median :10.110 Median :14.19
## Mean :16.44 Mean :15.37 Mean :10.846 Mean :13.97
## 3rd Qu.:19.72 3rd Qu.:17.39 3rd Qu.:15.880 3rd Qu.:16.45
## Max. :28.06 Max. :35.05 Max. :19.490 Max. :24.65
## Navarra Ourense Palencia Palma, La
## Min. : 5.33 Min. : 6.43 Min. : 5.05 Min. : 5.28
## 1st Qu.:13.45 1st Qu.:12.87 1st Qu.:12.18 1st Qu.:12.96
## Median :16.51 Median :15.62 Median :14.78 Median :15.39
## Mean :15.89 Mean :15.35 Mean :14.40 Mean :14.79
## 3rd Qu.:18.87 3rd Qu.:18.06 3rd Qu.:16.94 3rd Qu.:17.03
## Max. :30.61 Max. :27.48 Max. :31.07 Max. :24.63
## Palmas, Las Pontevedra Rioja, La Salamanca
## Min. : 5.11 Min. : 6.15 Min. : 4.74 Min. : 6.02
## 1st Qu.:13.98 1st Qu.:13.41 1st Qu.:12.51 1st Qu.:12.61
## Median :16.65 Median :17.23 Median :15.78 Median :15.07
## Mean :15.98 Mean :16.63 Mean :15.01 Mean :14.54
## 3rd Qu.:18.99 3rd Qu.:20.39 3rd Qu.:17.73 3rd Qu.:17.00
## Max. :28.53 Max. :26.22 Max. :28.00 Max. :28.12
## Santa Cruz de Tenerife Segovia Sevilla Soria
## Min. : 5.20 Min. : 4.96 Min. : 5.44 Min. : 3.03
## 1st Qu.:13.71 1st Qu.:11.83 1st Qu.:12.13 1st Qu.: 8.78
## Median :16.54 Median :14.48 Median :14.96 Median :10.95
## Mean :15.99 Mean :13.80 Mean :14.73 Mean :10.19
## 3rd Qu.:18.98 3rd Qu.:16.27 3rd Qu.:17.70 3rd Qu.:12.20
## Max. :28.02 Max. :29.04 Max. :25.95 Max. :17.67
## Tarragona Tenerife Teruel Toledo
## Min. : 4.93 Min. : 5.32 Min. : 1.950 Min. : 4.15
## 1st Qu.:10.64 1st Qu.:14.13 1st Qu.: 6.582 1st Qu.:11.00
## Median :14.95 Median :17.02 Median : 8.680 Median :13.52
## Mean :13.90 Mean :16.42 Mean : 8.210 Mean :13.01
## 3rd Qu.:16.90 3rd Qu.:19.58 3rd Qu.: 9.999 3rd Qu.:15.57
## Max. :28.38 Max. :28.87 Max. :16.350 Max. :25.70
## Valencia/València Valladolid Zamora Zaragoza
## Min. : 6.17 Min. : 5.55 Min. : 4.82 Min. : 5.80
## 1st Qu.:15.39 1st Qu.:12.60 1st Qu.:10.79 1st Qu.:13.44
## Median :18.16 Median :15.95 Median :13.09 Median :16.45
## Mean :17.53 Mean :15.71 Mean :12.79 Mean :16.37
## 3rd Qu.:20.38 3rd Qu.:19.10 3rd Qu.:14.95 3rd Qu.:19.37
## Max. :30.97 Max. :28.99 Max. :25.99 Max. :32.26
#library(DataExplorer)
#plot_histogram(EM3_t)
table(EM3$Zonas.de.movilidad)
##
## Albacete Alicante/Alacant Almería
## 291 291 291
## Araba/Álava Asturias Ávila
## 291 291 291
## Badajoz Balears, Illes Barcelona
## 291 291 291
## Bizkaia Burgos Cáceres
## 291 291 291
## Cádiz Cantabria Castellón/Castelló
## 291 291 291
## Ceuta Ciudad Real Córdoba
## 291 291 291
## Coruña, A Cuenca Formentera
## 291 291 291
## Fuerteventura Gipuzkoa Girona
## 291 291 291
## Gomera, La Gran Canaria Granada
## 291 291 291
## Guadalajara Hierro, El Huelva
## 291 291 291
## Huesca Ibiza Jaén
## 291 291 291
## Lanzarote León Lleida
## 291 291 291
## Lugo Madrid Málaga
## 291 291 291
## Mallorca Melilla Menorca
## 291 291 291
## Murcia Navarra Ourense
## 291 291 291
## Palencia Palma, La Palmas, Las
## 291 291 291
## Pontevedra Rioja, La Salamanca
## 291 291 291
## Santa Cruz de Tenerife Segovia Sevilla
## 291 291 291
## Soria Tarragona Tenerife
## 291 291 291
## Teruel Toledo Valencia/València
## 291 291 291
## Valladolid Zamora Zaragoza
## 291 291 291
Here we have data mobility from autonomus-communities and provinces.
#Source Google
summary(Google)
## country_region_code country_region sub_region_1 sub_region_2
## Length:24242 Length:24242 Length:24242 Length:24242
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## metro_area iso_3166_2_code census_fips_code place_id
## Mode:logical Length:24242 Mode:logical Length:24242
## NA's:24242 Class :character NA's:24242 Class :character
## Mode :character Mode :character
##
##
##
##
## date retail_and_recreation_percent_change_from_baseline
## Length:24242 Min. :-97.00
## Class :character 1st Qu.:-53.00
## Mode :character Median :-32.00
## Mean :-36.42
## 3rd Qu.:-17.00
## Max. : 71.00
## NA's :56
## grocery_and_pharmacy_percent_change_from_baseline
## Min. :-96.000
## 1st Qu.:-18.000
## Median : -4.000
## Mean : -9.973
## 3rd Qu.: 3.000
## Max. :194.000
## NA's :396
## parks_percent_change_from_baseline
## Min. :-94.0000
## 1st Qu.:-30.0000
## Median : -5.0000
## Mean : -0.0038
## 3rd Qu.: 22.0000
## Max. :543.0000
## NA's :305
## transit_stations_percent_change_from_baseline
## Min. :-100.00
## 1st Qu.: -46.00
## Median : -30.00
## Mean : -32.63
## 3rd Qu.: -16.00
## Max. : 177.00
## NA's :832
## workplaces_percent_change_from_baseline
## Min. :-92.0
## 1st Qu.:-37.0
## Median :-24.0
## Mean :-26.7
## 3rd Qu.:-13.0
## Max. : 55.0
## NA's :42
## residential_percent_change_from_baseline
## Min. :-10.000
## 1st Qu.: 4.000
## Median : 7.000
## Mean : 9.419
## 3rd Qu.: 13.000
## Max. : 48.000
## NA's :267
head(str(Google,vec.len=1))
## 'data.frame': 24242 obs. of 15 variables:
## $ country_region_code : chr "ES" ...
## $ country_region : chr "Spain" ...
## $ sub_region_1 : chr "" ...
## $ sub_region_2 : chr "" ...
## $ metro_area : logi NA ...
## $ iso_3166_2_code : chr "" ...
## $ census_fips_code : logi NA ...
## $ place_id : chr "ChIJi7xhMnjjQgwR7KNoB5Qs7KY" ...
## $ date : chr "15/02/2020" ...
## $ retail_and_recreation_percent_change_from_baseline: int 2 2 ...
## $ grocery_and_pharmacy_percent_change_from_baseline : int -1 3 ...
## $ parks_percent_change_from_baseline : int 26 13 ...
## $ transit_stations_percent_change_from_baseline : int 8 5 ...
## $ workplaces_percent_change_from_baseline : int 0 -1 ...
## $ residential_percent_change_from_baseline : int -2 -2 ...
## NULL
table(Google$sub_region_1)
##
## Andalusia Aragon Asturias
## 385 3465 1540 385
## Balearic Islands Basque Country Canary Islands Cantabria
## 385 1540 1155 385
## Castile-La Mancha Castile and León Catalonia Ceuta
## 2310 3850 1925 378
## Community of Madrid Extremadura Galicia La Rioja
## 385 1155 1925 385
## Melilla Navarre Region of Murcia Valencian Community
## 379 385 385 1540
table(Google$sub_region_2)
##
## A Coruña Ã\201lava
## 7687 385 385
## Ã\201vila Albacete Alicante
## 385 385 385
## AlmerÃa Badajoz Barcelona
## 385 385 385
## Biscay Burgos Cáceres
## 385 385 385
## Cádiz Córdoba Castellón
## 385 385 385
## Ciudad Real Cuenca Gipuzkoa
## 385 385 385
## Girona Granada Guadalajara
## 385 385 385
## Huelva Huesca Jaén
## 385 385 385
## Las Palmas León Lleida
## 385 385 385
## Lugo Málaga Palencia
## 385 385 385
## Pontevedra Province of Ourense Salamanca
## 385 385 385
## Santa Cruz de Tenerife Segovia Seville
## 385 385 385
## Soria Tarragona Teruel
## 385 385 385
## Toledo Valencia Valladolid
## 385 385 385
## Zamora Zaragoza
## 385 385
table(Google$iso_3166_2_code)
##
## ES-A ES-AB ES-AL ES-AN ES-AR ES-AS ES-AV ES-B ES-BA ES-BI ES-BU ES-C
## 385 385 385 385 385 385 385 385 385 385 385 385 385
## ES-CA ES-CB ES-CC ES-CE ES-CL ES-CM ES-CN ES-CO ES-CR ES-CS ES-CT ES-CU ES-EX
## 385 385 385 378 385 385 385 385 385 385 385 385 385
## ES-GA ES-GC ES-GI ES-GR ES-GU ES-H ES-HU ES-IB ES-J ES-L ES-LE ES-LU ES-MA
## 385 385 385 385 385 385 385 385 385 385 385 385 385
## ES-MC ES-MD ES-ML ES-NC ES-OR ES-P ES-PO ES-PV ES-RI ES-SA ES-SE ES-SG ES-SO
## 385 385 379 385 385 385 385 385 385 385 385 385 385
## ES-SS ES-T ES-TE ES-TF ES-TO ES-V ES-VA ES-VC ES-VI ES-Z ES-ZA
## 385 385 385 385 385 385 385 385 385 385 385
We check data grouped by autonomous communities and provinces.
Google %>% group_by(sub_region_1) %>% tally()
## # A tibble: 20 x 2
## sub_region_1 n
## <chr> <int>
## 1 "" 385
## 2 "Andalusia" 3465
## 3 "Aragon" 1540
## 4 "Asturias" 385
## 5 "Balearic Islands" 385
## 6 "Basque Country" 1540
## 7 "Canary Islands" 1155
## 8 "Cantabria" 385
## 9 "Castile-La Mancha" 2310
## 10 "Castile and León" 3850
## 11 "Catalonia" 1925
## 12 "Ceuta" 378
## 13 "Community of Madrid" 385
## 14 "Extremadura" 1155
## 15 "Galicia" 1925
## 16 "La Rioja" 385
## 17 "Melilla" 379
## 18 "Navarre" 385
## 19 "Region of Murcia" 385
## 20 "Valencian Community" 1540
Google %>% group_by(sub_region_1) %>% count(sub_region_2)
## # A tibble: 63 x 3
## # Groups: sub_region_1 [20]
## sub_region_1 sub_region_2 n
## <chr> <chr> <int>
## 1 "" "" 385
## 2 "Andalusia" "" 385
## 3 "Andalusia" "AlmerÃa" 385
## 4 "Andalusia" "Cádiz" 385
## 5 "Andalusia" "Córdoba" 385
## 6 "Andalusia" "Granada" 385
## 7 "Andalusia" "Huelva" 385
## 8 "Andalusia" "Jaén" 385
## 9 "Andalusia" "Málaga" 385
## 10 "Andalusia" "Seville" 385
## # ... with 53 more rows
In Spain there are autonomous communities (AC) and autonomous cities (C) that are considered as provinces (Pr). This is the case for:
In this data set, the empty values in the “sub_region_2” column, for the autonomous communities mentinoed, will be replaced by the value contained in the “sub_region_1” column (A). Also we are going to modify the names of the provinces that have special characters in order to adopt the INE standards (B). See note.
Note The following links states the provinces in Spain INE CCAA and its ISO codes are going to be used as tables of referencence.
# Modification provinces - A
Google$sub_region_2[Google$sub_region_1=="Balearic Islands"] <- "Balears, Illes"
Google$iso_3166_2_code[Google$sub_region_2=="Balears, Illes"] <- "PM"
Google$sub_region_2[Google$sub_region_1=="Asturias"] <- "Asturias"
Google$iso_3166_2_code[Google$sub_region_2=="Asturias"] <- "O"
Google$sub_region_2[Google$sub_region_1=="Cantabria"] <- "Cantabria"
Google$iso_3166_2_code[Google$sub_region_2=="Cantabria"] <- "S"
Google$sub_region_2[Google$sub_region_1=="Community of Madrid"] <- "Madrid"
Google$iso_3166_2_code[Google$sub_region_2=="Madrid"] <- "M"
Google$sub_region_2[Google$sub_region_1=="Region of Murcia"] <- "Murcia"
Google$iso_3166_2_code[Google$sub_region_2=="Murcia"] <- "MU"
Google$sub_region_2[Google$sub_region_1=="Navarre"] <- "Navarra"
Google$iso_3166_2_code[Google$sub_region_2=="Navarra"] <- "NA"
Google$sub_region_2[Google$sub_region_1=="La Rioja"] <- "Rioja, La"
Google$iso_3166_2_code[Google$sub_region_2=="Rioja, La"] <- "LO"
Google$sub_region_2[Google$sub_region_1=="Ceuta"] <- "Ceuta"
Google$iso_3166_2_code[Google$sub_region_2=="Ceuta"] <- "CE"
Google$sub_region_2[Google$sub_region_1=="Melilla"] <- "Melilla"
Google$iso_3166_2_code[Google$sub_region_2=="Melilla"] <- "ML"
# Modification provinces - B
Google$sub_region_2[Google$sub_region_2=="A Coruña"]<-"Coruña, A"
Google$sub_region_2[Google$sub_region_2=="Ã\u0081lava"]<-"Araba/Álava"
Google$sub_region_2[Google$sub_region_2=="Ã\u0081vila"]<-"Ávila"
Google$sub_region_2[Google$sub_region_2=="Alicante"]<-"Alicante/Alacant"
Google$sub_region_2[Google$sub_region_2=="Biscay"]<-"Bizkaia"
Google$sub_region_2[Google$sub_region_2=="Cáceres"]<-"Cáceres"
Google$sub_region_2[Google$sub_region_2=="Cádiz"]<-"Cádiz"
Google$sub_region_2[Google$sub_region_2=="Córdoba"]<-"Córdoba"
Google$sub_region_2[Google$sub_region_2=="Castellón"]<-"Castellón/Castelló"
Google$sub_region_2[Google$sub_region_2=="Jaén"]<-"Jaén"
Google$sub_region_2[Google$sub_region_2=="Las Palmas"]<-"Palmas, Las"
Google$sub_region_2[Google$sub_region_2=="León"]<-"León"
Google$sub_region_2[Google$sub_region_2=="Málaga"]<-"Málaga"
Google$sub_region_2[Google$sub_region_2=="Province of Ourense"]<-"Ourense"
Google$sub_region_2[Google$sub_region_2=="Seville"]<-"Sevilla"
Google$sub_region_2[Google$sub_region_2=="Valencia"]<-"Valencia/València"
Google$sub_region_2 <- with(Google, ifelse(grepl("^Almer", sub_region_2),
"Almería", sub_region_2))
# Table check
table(Google$sub_region_2)
##
## Albacete Alicante/Alacant
## 4235 385 385
## Almería Araba/Álava Asturias
## 385 385 385
## Ávila Badajoz Balears, Illes
## 385 385 385
## Barcelona Bizkaia Burgos
## 385 385 385
## Cáceres Cádiz Cantabria
## 385 385 385
## Castellón/Castelló Ceuta Ciudad Real
## 385 378 385
## Córdoba Coruña, A Cuenca
## 385 385 385
## Gipuzkoa Girona Granada
## 385 385 385
## Guadalajara Huelva Huesca
## 385 385 385
## Jaén León Lleida
## 385 385 385
## Lugo Madrid Málaga
## 385 385 385
## Melilla Murcia Navarra
## 379 385 385
## Ourense Palencia Palmas, Las
## 385 385 385
## Pontevedra Rioja, La Salamanca
## 385 385 385
## Santa Cruz de Tenerife Segovia Sevilla
## 385 385 385
## Soria Tarragona Teruel
## 385 385 385
## Toledo Valencia/València Valladolid
## 385 385 385
## Zamora Zaragoza
## 385 385
table(Google$iso_3166_2_code)
##
## CE ES-A ES-AB ES-AL ES-AN ES-AR ES-AV ES-B ES-BA ES-BI ES-BU ES-C
## 385 378 385 385 385 385 385 385 385 385 385 385 385
## ES-CA ES-CC ES-CL ES-CM ES-CN ES-CO ES-CR ES-CS ES-CT ES-CU ES-EX ES-GA ES-GC
## 385 385 385 385 385 385 385 385 385 385 385 385 385
## ES-GI ES-GR ES-GU ES-H ES-HU ES-J ES-L ES-LE ES-LU ES-MA ES-OR ES-P ES-PO
## 385 385 385 385 385 385 385 385 385 385 385 385 385
## ES-PV ES-SA ES-SE ES-SG ES-SO ES-SS ES-T ES-TE ES-TF ES-TO ES-V ES-VA ES-VC
## 385 385 385 385 385 385 385 385 385 385 385 385 385
## ES-VI ES-Z ES-ZA LO M ML MU NA O PM S
## 385 385 385 385 385 379 385 385 385 385 385
We are going to transform / eliminate:
# Transform / eliminate A
Google <- filter(Google, sub_region_1 != "", sub_region_2 != "" )
# Transform / eliminate B
Google$date <- as.Date(Google$date ,format="%d/%m/%Y")
# Transform / eliminate C
Google<-within(Google, rm(country_region_code,
country_region,
metro_area,
census_fips_code,
place_id))
# Transform / eliminate D
Google$iso_3166_2_code <- gsub("ES-", "", Google$iso_3166_2_code)
# We pass from integer to numeric
Google$retail_and_recreation_percent_change_from_baseline <-
as.numeric(Google$retail_and_recreation_percent_change_from_baseline)
Google$grocery_and_pharmacy_percent_change_from_baseline <-as.numeric(Google$grocery_and_pharmacy_percent_change_from_baseline)
Google$parks_percent_change_from_baseline <-as.numeric(Google$parks_percent_change_from_baseline)
Google$transit_stations_percent_change_from_baseline <-as.numeric(Google$transit_stations_percent_change_from_baseline)
Google$workplaces_percent_change_from_baseline <-as.numeric(Google$workplaces_percent_change_from_baseline)
Google$residential_percent_change_from_baseline <-as.numeric(Google$residential_percent_change_from_baseline)
# Check table
head(Google,5)
## sub_region_1 sub_region_2 iso_3166_2_code date
## 1 Andalusia Almería AL 2020-02-15
## 2 Andalusia Almería AL 2020-02-16
## 3 Andalusia Almería AL 2020-02-17
## 4 Andalusia Almería AL 2020-02-18
## 5 Andalusia Almería AL 2020-02-19
## retail_and_recreation_percent_change_from_baseline
## 1 5
## 2 -2
## 3 0
## 4 -3
## 5 -1
## grocery_and_pharmacy_percent_change_from_baseline
## 1 -3
## 2 0
## 3 -2
## 4 -3
## 5 -3
## parks_percent_change_from_baseline
## 1 40
## 2 -2
## 3 3
## 4 -2
## 5 3
## transit_stations_percent_change_from_baseline
## 1 10
## 2 1
## 3 5
## 4 5
## 5 4
## workplaces_percent_change_from_baseline
## 1 1
## 2 1
## 3 3
## 4 3
## 5 3
## residential_percent_change_from_baseline
## 1 -2
## 2 -1
## 3 -1
## 4 0
## 5 0
table(Google$sub_region_2)
##
## Albacete Alicante/Alacant Almería
## 385 385 385
## Araba/Álava Asturias Ávila
## 385 385 385
## Badajoz Balears, Illes Barcelona
## 385 385 385
## Bizkaia Burgos Cáceres
## 385 385 385
## Cádiz Cantabria Castellón/Castelló
## 385 385 385
## Ceuta Ciudad Real Córdoba
## 378 385 385
## Coruña, A Cuenca Gipuzkoa
## 385 385 385
## Girona Granada Guadalajara
## 385 385 385
## Huelva Huesca Jaén
## 385 385 385
## León Lleida Lugo
## 385 385 385
## Madrid Málaga Melilla
## 385 385 379
## Murcia Navarra Ourense
## 385 385 385
## Palencia Palmas, Las Pontevedra
## 385 385 385
## Rioja, La Salamanca Santa Cruz de Tenerife
## 385 385 385
## Segovia Sevilla Soria
## 385 385 385
## Tarragona Teruel Toledo
## 385 385 385
## Valencia/València Valladolid Zamora
## 385 385 385
## Zaragoza
## 385
table(Google$iso_3166_2_code)
##
## A AB AL AV B BA BI BU C CA CC CE CO CR CS CU GC GI GR GU
## 385 385 385 385 385 385 385 385 385 385 385 378 385 385 385 385 385 385 385 385
## H HU J L LE LO LU M MA ML MU NA O OR P PM PO S SA SE
## 385 385 385 385 385 385 385 385 385 379 385 385 385 385 385 385 385 385 385 385
## SG SO SS T TE TF TO V VA VI Z ZA
## 385 385 385 385 385 385 385 385 385 385 385 385
#unique(Google$sub_region_2)
#unique(EM3$Zonas.de.movilidad)
We check missing values.
aggr(Google, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(Google), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## transit_stations_percent_change_from_baseline 0.041585445
## parks_percent_change_from_baseline 0.015244664
## residential_percent_change_from_baseline 0.013345329
## grocery_and_pharmacy_percent_change_from_baseline 0.013195382
## retail_and_recreation_percent_change_from_baseline 0.002799020
## workplaces_percent_change_from_baseline 0.002099265
## sub_region_1 0.000000000
## sub_region_2 0.000000000
## iso_3166_2_code 0.000000000
## date 0.000000000
Google %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
We generate 6 new dataframes from the 6 features stated in order to imput missing values by province using the approach stated at “imputeTS” library (and also used at EM3).
# Transpose dataframe
Google_retail<-Google[c(2,4,5)]
Google_t_retail<-dcast(Google_retail, date~sub_region_2, fill=NA)
# Visualize missing values
aggr(Google_t_retail, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(Google_t_retail), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Ceuta 0.06493506
## Melilla 0.06493506
## Soria 0.04935065
## date 0.00000000
## Albacete 0.00000000
## Alicante/Alacant 0.00000000
## Almería 0.00000000
## Araba/Álava 0.00000000
## Asturias 0.00000000
## Ávila 0.00000000
## Badajoz 0.00000000
## Balears, Illes 0.00000000
## Barcelona 0.00000000
## Bizkaia 0.00000000
## Burgos 0.00000000
## Cáceres 0.00000000
## Cádiz 0.00000000
## Cantabria 0.00000000
## Castellón/Castelló 0.00000000
## Ciudad Real 0.00000000
## Córdoba 0.00000000
## Coruña, A 0.00000000
## Cuenca 0.00000000
## Gipuzkoa 0.00000000
## Girona 0.00000000
## Granada 0.00000000
## Guadalajara 0.00000000
## Huelva 0.00000000
## Huesca 0.00000000
## Jaén 0.00000000
## León 0.00000000
## Lleida 0.00000000
## Lugo 0.00000000
## Madrid 0.00000000
## Málaga 0.00000000
## Murcia 0.00000000
## Navarra 0.00000000
## Ourense 0.00000000
## Palencia 0.00000000
## Palmas, Las 0.00000000
## Pontevedra 0.00000000
## Rioja, La 0.00000000
## Salamanca 0.00000000
## Santa Cruz de Tenerife 0.00000000
## Segovia 0.00000000
## Sevilla 0.00000000
## Tarragona 0.00000000
## Teruel 0.00000000
## Toledo 0.00000000
## Valencia/València 0.00000000
## Valladolid 0.00000000
## Zamora 0.00000000
## Zaragoza 0.00000000
Google_t_retail %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert dataframe to ts object
Google_t_retail_ts<-xts(Google_t_retail[-1],Google_t_retail$date)
# Impute the missing values with na_seadec (i.e Ceuta)
imp5 <- na_seadec(Google_t_retail_ts[,16])
ggplot_na_imputations(Google_t_retail_ts[,16], imp5)
# We select na_seadec for the dataset
Google_t_retail_ts <- na_seadec(Google_t_retail_ts)
plot(Google_t_retail_ts[,16])
# We convert the time series object to a dataframe
Google_retail <- ts_df(Google_t_retail_ts)
names(Google_retail)[names(Google_retail) == "id"] <- "sub_region_2"
names(Google_retail)[names(Google_retail) == "time"] <- "Date"
names(Google_retail)[names(Google_retail) == "value"] <-
"retail_and_recreation_percent_change_from_baseline"
###################################################################
# Transpose dataframe
Google_grocery<-Google[c(2,4,6)]
Google_t_grocery<-dcast(Google_grocery, date~sub_region_2, fill=NA)
# Visualize missing values
aggr(Google_t_grocery, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(Google_t_grocery), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Melilla 0.174025974
## Asturias 0.109090909
## Murcia 0.093506494
## Soria 0.085714286
## Ceuta 0.070129870
## Cantabria 0.025974026
## Albacete 0.007792208
## Araba/Álava 0.007792208
## Ávila 0.007792208
## Burgos 0.007792208
## Cáceres 0.007792208
## Ciudad Real 0.007792208
## Cuenca 0.007792208
## Guadalajara 0.007792208
## Huesca 0.007792208
## Jaén 0.007792208
## León 0.007792208
## Lugo 0.007792208
## Navarra 0.007792208
## Ourense 0.007792208
## Palencia 0.007792208
## Rioja, La 0.007792208
## Salamanca 0.007792208
## Segovia 0.007792208
## Teruel 0.007792208
## Zamora 0.007792208
## Madrid 0.002597403
## Valladolid 0.002597403
## date 0.000000000
## Alicante/Alacant 0.000000000
## Almería 0.000000000
## Badajoz 0.000000000
## Balears, Illes 0.000000000
## Barcelona 0.000000000
## Bizkaia 0.000000000
## Cádiz 0.000000000
## Castellón/Castelló 0.000000000
## Córdoba 0.000000000
## Coruña, A 0.000000000
## Gipuzkoa 0.000000000
## Girona 0.000000000
## Granada 0.000000000
## Huelva 0.000000000
## Lleida 0.000000000
## Málaga 0.000000000
## Palmas, Las 0.000000000
## Pontevedra 0.000000000
## Santa Cruz de Tenerife 0.000000000
## Sevilla 0.000000000
## Tarragona 0.000000000
## Toledo 0.000000000
## Valencia/València 0.000000000
## Zaragoza 0.000000000
Google_t_grocery %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert dataframe to ts object
Google_t_grocery_ts<-xts(Google_t_grocery[-1],Google_t_grocery$date)
# Impute the missing values with na_seadec (i.e Ceuta)
imp6 <- na_seadec(Google_t_grocery_ts[,16])
ggplot_na_imputations(Google_t_grocery_ts[,16], imp6)
# We select na_seadec for the dataset
Google_t_grocery_ts <- na_seadec(Google_t_grocery_ts)
plot(Google_t_grocery_ts[,16])
# We convert the time series object to a dataframe
Google_grocery <- ts_df(Google_t_grocery_ts)
names(Google_grocery)[names(Google_grocery) == "id"] <- "sub_region_2"
names(Google_grocery)[names(Google_grocery) == "time"] <- "Date"
names(Google_grocery)[names(Google_grocery) == "value"] <-
"grocery_and_pharmacy_percent_change_from_baseline"
###################################################################
# Transpose dataframe
Google_parks<-Google[c(2,4,7)]
Google_t_parks<-dcast(Google_parks, date~sub_region_2, fill=NA)
# Visualize missing values
aggr(Google_t_parks, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(Google_t_parks), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Ceuta 0.064935065
## Melilla 0.064935065
## Palencia 0.064935065
## Soria 0.064935065
## Cuenca 0.062337662
## Ávila 0.059740260
## Teruel 0.057142857
## Huesca 0.054545455
## Zamora 0.051948052
## Cantabria 0.046753247
## Lleida 0.044155844
## Rioja, La 0.044155844
## Burgos 0.033766234
## Guadalajara 0.031168831
## León 0.031168831
## Segovia 0.018181818
## Albacete 0.012987013
## Navarra 0.010389610
## Ourense 0.005194805
## Asturias 0.002597403
## date 0.000000000
## Alicante/Alacant 0.000000000
## Almería 0.000000000
## Araba/Álava 0.000000000
## Badajoz 0.000000000
## Balears, Illes 0.000000000
## Barcelona 0.000000000
## Bizkaia 0.000000000
## Cáceres 0.000000000
## Cádiz 0.000000000
## Castellón/Castelló 0.000000000
## Ciudad Real 0.000000000
## Córdoba 0.000000000
## Coruña, A 0.000000000
## Gipuzkoa 0.000000000
## Girona 0.000000000
## Granada 0.000000000
## Huelva 0.000000000
## Jaén 0.000000000
## Lugo 0.000000000
## Madrid 0.000000000
## Málaga 0.000000000
## Murcia 0.000000000
## Palmas, Las 0.000000000
## Pontevedra 0.000000000
## Salamanca 0.000000000
## Santa Cruz de Tenerife 0.000000000
## Sevilla 0.000000000
## Tarragona 0.000000000
## Toledo 0.000000000
## Valencia/València 0.000000000
## Valladolid 0.000000000
## Zaragoza 0.000000000
Google_t_parks %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert dataframe to ts object
Google_t_parks_ts<-xts(Google_t_parks[-1],Google_t_parks$date)
# Impute the missing values with na_seadec (i.e Ceuta)
imp7 <- na_seadec(Google_t_parks_ts[,16])
ggplot_na_imputations(Google_t_parks_ts[,16], imp7)
# We select na_seadec for the dataset
Google_t_parks_ts <- na_seadec(Google_t_parks_ts)
plot(Google_t_parks_ts[,16])
# We convert the time series object to a dataframe
Google_parks <- ts_df(Google_t_parks_ts)
names(Google_parks)[names(Google_parks) == "id"] <- "sub_region_2"
names(Google_parks)[names(Google_parks) == "time"] <- "Date"
names(Google_parks)[names(Google_parks) == "value"] <-
"parks_percent_change_from_baseline"
###################################################################
# Transpose dataframe
Google_transit<-Google[c(2,4,8)]
Google_t_transit<-dcast(Google_transit, date~sub_region_2, fill=NA)
# Visualize missing values
aggr(Google_t_transit, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(Google_t_transit), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Ceuta 0.97142857
## Melilla 0.28311688
## Teruel 0.08831169
## Zamora 0.06753247
## Ávila 0.06493506
## Cuenca 0.06493506
## Huelva 0.06493506
## Huesca 0.06493506
## Lugo 0.06493506
## Palencia 0.06493506
## Rioja, La 0.06493506
## Segovia 0.06493506
## Soria 0.06493506
## Ourense 0.06233766
## Burgos 0.05454545
## Cáceres 0.04935065
## Ciudad Real 0.01818182
## Guadalajara 0.01558442
## date 0.00000000
## Albacete 0.00000000
## Alicante/Alacant 0.00000000
## Almería 0.00000000
## Araba/Álava 0.00000000
## Asturias 0.00000000
## Badajoz 0.00000000
## Balears, Illes 0.00000000
## Barcelona 0.00000000
## Bizkaia 0.00000000
## Cádiz 0.00000000
## Cantabria 0.00000000
## Castellón/Castelló 0.00000000
## Córdoba 0.00000000
## Coruña, A 0.00000000
## Gipuzkoa 0.00000000
## Girona 0.00000000
## Granada 0.00000000
## Jaén 0.00000000
## León 0.00000000
## Lleida 0.00000000
## Madrid 0.00000000
## Málaga 0.00000000
## Murcia 0.00000000
## Navarra 0.00000000
## Palmas, Las 0.00000000
## Pontevedra 0.00000000
## Salamanca 0.00000000
## Santa Cruz de Tenerife 0.00000000
## Sevilla 0.00000000
## Tarragona 0.00000000
## Toledo 0.00000000
## Valencia/València 0.00000000
## Valladolid 0.00000000
## Zaragoza 0.00000000
Google_t_transit %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert dataframe to ts object
Google_t_transit_ts<-xts(Google_t_transit[-1],Google_t_transit$date)
# Impute the missing values with na_seadec (i.e Ceuta)
imp8 <- na_seadec(Google_t_transit_ts[,16])
ggplot_na_imputations(Google_t_transit_ts[,16], imp8)
# We select na_seadec for the dataset
Google_t_transit_ts <- na_seadec(Google_t_transit_ts)
plot(Google_t_transit_ts[,16])
# We convert the time series object to a dataframe
Google_transit <- ts_df(Google_t_transit_ts)
names(Google_transit)[names(Google_transit) == "id"] <- "sub_region_2"
names(Google_transit)[names(Google_transit) == "time"] <- "Date"
names(Google_transit)[names(Google_transit) == "value"] <-
"transit_stations_percent_change_from_baseline"
###################################################################
# Transpose dataframe
Google_workplaces<-Google[c(2,4,9)]
Google_t_workplaces<-dcast(Google_workplaces, date~sub_region_2, fill=NA)
# Visualize missing values
aggr(Google_t_workplaces, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(Google_t_workplaces), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Ceuta 0.06493506
## Melilla 0.06233766
## Soria 0.01558442
## date 0.00000000
## Albacete 0.00000000
## Alicante/Alacant 0.00000000
## Almería 0.00000000
## Araba/Álava 0.00000000
## Asturias 0.00000000
## Ávila 0.00000000
## Badajoz 0.00000000
## Balears, Illes 0.00000000
## Barcelona 0.00000000
## Bizkaia 0.00000000
## Burgos 0.00000000
## Cáceres 0.00000000
## Cádiz 0.00000000
## Cantabria 0.00000000
## Castellón/Castelló 0.00000000
## Ciudad Real 0.00000000
## Córdoba 0.00000000
## Coruña, A 0.00000000
## Cuenca 0.00000000
## Gipuzkoa 0.00000000
## Girona 0.00000000
## Granada 0.00000000
## Guadalajara 0.00000000
## Huelva 0.00000000
## Huesca 0.00000000
## Jaén 0.00000000
## León 0.00000000
## Lleida 0.00000000
## Lugo 0.00000000
## Madrid 0.00000000
## Málaga 0.00000000
## Murcia 0.00000000
## Navarra 0.00000000
## Ourense 0.00000000
## Palencia 0.00000000
## Palmas, Las 0.00000000
## Pontevedra 0.00000000
## Rioja, La 0.00000000
## Salamanca 0.00000000
## Santa Cruz de Tenerife 0.00000000
## Segovia 0.00000000
## Sevilla 0.00000000
## Tarragona 0.00000000
## Teruel 0.00000000
## Toledo 0.00000000
## Valencia/València 0.00000000
## Valladolid 0.00000000
## Zamora 0.00000000
## Zaragoza 0.00000000
Google_t_workplaces %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert dataframe to ts object
Google_t_workplaces_ts<-xts(Google_t_workplaces[-1],Google_t_workplaces$date)
# Impute the missing values with na_seadec (i.e Ceuta)
imp9 <- na_seadec(Google_t_workplaces_ts[,16])
ggplot_na_imputations(Google_t_workplaces_ts[,16], imp9)
# We select na_seadec for the dataset
Google_t_workplaces_ts <- na_seadec(Google_t_workplaces_ts)
plot(Google_t_workplaces_ts[,16])
# We convert the time series object to a dataframe
Google_workplaces <- ts_df(Google_t_workplaces_ts)
names(Google_workplaces)[names(Google_workplaces) == "id"] <- "sub_region_2"
names(Google_workplaces)[names(Google_workplaces) == "time"] <- "Date"
names(Google_workplaces)[names(Google_workplaces) == "value"] <-
"workplaces_percent_change_from_baseline"
###################################################################
# Transpose dataframe
Google_residential<-Google[c(2,4,10)]
Google_t_residential<-dcast(Google_residential, date~sub_region_2, fill=NA)
# Visualize missing values
aggr(Google_t_residential, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(Google_t_residential), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Ceuta 0.36363636
## Melilla 0.34025974
## Soria 0.02337662
## date 0.00000000
## Albacete 0.00000000
## Alicante/Alacant 0.00000000
## Almería 0.00000000
## Araba/Álava 0.00000000
## Asturias 0.00000000
## Ávila 0.00000000
## Badajoz 0.00000000
## Balears, Illes 0.00000000
## Barcelona 0.00000000
## Bizkaia 0.00000000
## Burgos 0.00000000
## Cáceres 0.00000000
## Cádiz 0.00000000
## Cantabria 0.00000000
## Castellón/Castelló 0.00000000
## Ciudad Real 0.00000000
## Córdoba 0.00000000
## Coruña, A 0.00000000
## Cuenca 0.00000000
## Gipuzkoa 0.00000000
## Girona 0.00000000
## Granada 0.00000000
## Guadalajara 0.00000000
## Huelva 0.00000000
## Huesca 0.00000000
## Jaén 0.00000000
## León 0.00000000
## Lleida 0.00000000
## Lugo 0.00000000
## Madrid 0.00000000
## Málaga 0.00000000
## Murcia 0.00000000
## Navarra 0.00000000
## Ourense 0.00000000
## Palencia 0.00000000
## Palmas, Las 0.00000000
## Pontevedra 0.00000000
## Rioja, La 0.00000000
## Salamanca 0.00000000
## Santa Cruz de Tenerife 0.00000000
## Segovia 0.00000000
## Sevilla 0.00000000
## Tarragona 0.00000000
## Teruel 0.00000000
## Toledo 0.00000000
## Valencia/València 0.00000000
## Valladolid 0.00000000
## Zamora 0.00000000
## Zaragoza 0.00000000
Google_t_residential %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert dataframe to ts object
Google_t_residential_ts<-xts(Google_t_residential[-1],Google_t_residential$date)
# Impute the missing values with na_seadec (i.e Ceuta)
imp10 <- na_seadec(Google_t_residential_ts[,16])
ggplot_na_imputations(Google_t_residential_ts[,16], imp10)
# We select na_seadec for the dataset
Google_t_residential_ts <- na_seadec(Google_t_residential_ts)
plot(Google_t_residential_ts[,16])
# We convert the time series object to a dataframe
Google_residential <- ts_df(Google_t_residential_ts)
names(Google_residential)[names(Google_residential) == "id"] <- "sub_region_2"
names(Google_residential)[names(Google_residential) == "time"] <- "Date"
names(Google_residential)[names(Google_residential) == "value"] <-
"residential_percent_change_from_baseline"
Now we merge the previous dataframes into new one with the imputed vaules and we add the ISO code for the province.
# New dataframe Google_b
# This approach assumes that the column names are the same and that there's the same number of rows (our case) in each data frame you are merging.
# Any duplicated columns are automatically eliminated used in the merging process.
Google_b <- merge(Google_retail, Google_grocery) %>%
merge(Google_parks) %>%
merge(Google_transit) %>%
merge(Google_workplaces) %>%
merge(Google_residential)
# We add the iso code for the province
Google_b$iso_code <- NA
Google_b$iso_code<-Google[match(Google_b$sub_region_2, Google$sub_region_2),3]
rm("Google")
Google<-Google_b
rm("Google_b")
# Check table
head(Google,5)
## sub_region_2 Date retail_and_recreation_percent_change_from_baseline
## 1 Albacete 2020-02-15 3
## 2 Albacete 2020-02-16 5
## 3 Albacete 2020-02-17 -2
## 4 Albacete 2020-02-18 -3
## 5 Albacete 2020-02-19 0
## grocery_and_pharmacy_percent_change_from_baseline
## 1 -5
## 2 1
## 3 3
## 4 -1
## 5 1
## parks_percent_change_from_baseline
## 1 35
## 2 40
## 3 7
## 4 -4
## 5 7
## transit_stations_percent_change_from_baseline
## 1 13
## 2 18
## 3 20
## 4 6
## 5 9
## workplaces_percent_change_from_baseline
## 1 1
## 2 0
## 3 5
## 4 4
## 5 4
## residential_percent_change_from_baseline iso_code
## 1 -3 AB
## 2 -4 AB
## 3 -1 AB
## 4 -1 AB
## 5 -1 AB
table(Google$sub_region_2)
##
## Albacete Alicante/Alacant Almería
## 385 385 385
## Araba/Álava Asturias Ávila
## 385 385 385
## Badajoz Balears, Illes Barcelona
## 385 385 385
## Bizkaia Burgos Cáceres
## 385 385 385
## Cádiz Cantabria Castellón/Castelló
## 385 385 385
## Ceuta Ciudad Real Córdoba
## 385 385 385
## Coruña, A Cuenca Gipuzkoa
## 385 385 385
## Girona Granada Guadalajara
## 385 385 385
## Huelva Huesca Jaén
## 385 385 385
## León Lleida Lugo
## 385 385 385
## Madrid Málaga Melilla
## 385 385 385
## Murcia Navarra Ourense
## 385 385 385
## Palencia Palmas, Las Pontevedra
## 385 385 385
## Rioja, La Salamanca Santa Cruz de Tenerife
## 385 385 385
## Segovia Sevilla Soria
## 385 385 385
## Tarragona Teruel Toledo
## 385 385 385
## Valencia/València Valladolid Zamora
## 385 385 385
## Zaragoza
## 385
table(Google$iso_code)
##
## A AB AL AV B BA BI BU C CA CC CE CO CR CS CU GC GI GR GU
## 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385
## H HU J L LE LO LU M MA ML MU NA O OR P PM PO S SA SE
## 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385
## SG SO SS T TE TF TO V VA VI Z ZA
## 385 385 385 385 385 385 385 385 385 385 385 385
We check missing values. We should obtain zero missing values.
aggr(Google, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(Google), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## sub_region_2 0
## Date 0
## retail_and_recreation_percent_change_from_baseline 0
## grocery_and_pharmacy_percent_change_from_baseline 0
## parks_percent_change_from_baseline 0
## transit_stations_percent_change_from_baseline 0
## workplaces_percent_change_from_baseline 0
## residential_percent_change_from_baseline 0
## iso_code 0
Google %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The CSV files are provided per “imputed date” (fecha)":
summary(CNE_tecnica)
## provincia_iso fecha num_casos num_casos_prueba_pcr
## Length:23426 Length:23426 Min. : 0.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 2.0 1st Qu.: 2.0
## Mode :character Mode :character Median : 32.0 Median : 26.0
## Mean : 136.9 Mean : 109.6
## 3rd Qu.: 120.0 3rd Qu.: 100.0
## Max. :6972.0 Max. :6546.0
## num_casos_prueba_test_ac num_casos_prueba_ag num_casos_prueba_elisa
## Min. : 0.0000 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.00 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.00 Median : 0.0000
## Mean : 0.2037 Mean : 26.21 Mean : 0.1602
## 3rd Qu.: 0.0000 3rd Qu.: 9.00 3rd Qu.: 0.0000
## Max. :32.0000 Max. :3267.00 Max. :71.0000
## num_casos_prueba_desconocida
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.7122
## 3rd Qu.: 0.0000
## Max. :505.0000
head(str(CNE_tecnica, vec.len=3))
## 'data.frame': 23426 obs. of 8 variables:
## $ provincia_iso : chr "A" "AB" "AL" ...
## $ fecha : chr "2020-01-01" "2020-01-01" "2020-01-01" ...
## $ num_casos : int 0 0 0 0 0 0 0 0 ...
## $ num_casos_prueba_pcr : int 0 0 0 0 0 0 0 0 ...
## $ num_casos_prueba_test_ac : int 0 0 0 0 0 0 0 0 ...
## $ num_casos_prueba_ag : int 0 0 0 0 0 0 0 0 ...
## $ num_casos_prueba_elisa : int 0 0 0 0 0 0 0 0 ...
## $ num_casos_prueba_desconocida: int 0 0 0 0 0 0 0 0 ...
## NULL
table(CNE_tecnica$provincia_iso)
##
## A AB AL AV B BA BI BU C CA CC CE CO CR CS CU GC GI GR GU
## 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442
## H HU J L LE LO LU M MA ML MU NC O OR P PM PO S SA SE
## 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442
## SG SO SS T TE TF TO V VA VI Z ZA
## 442 442 442 442 442 442 442 442 442 442 442 442
We check missing values for CNE_tecnica. In this case we omit the NA values.
aggr(CNE_tecnica, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(CNE_tecnica), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## provincia_iso 0.01886792
## fecha 0.00000000
## num_casos 0.00000000
## num_casos_prueba_pcr 0.00000000
## num_casos_prueba_test_ac 0.00000000
## num_casos_prueba_ag 0.00000000
## num_casos_prueba_elisa 0.00000000
## num_casos_prueba_desconocida 0.00000000
CNE_tecnica %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## List of 1
## $ axis.text.x:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : NULL
## ..$ angle : num 45
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
####################################
CNE_tecnica <- na.omit(CNE_tecnica)
####################################
aggr(CNE_tecnica, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(CNE_tecnica), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## provincia_iso 0
## fecha 0
## num_casos 0
## num_casos_prueba_pcr 0
## num_casos_prueba_test_ac 0
## num_casos_prueba_ag 0
## num_casos_prueba_elisa 0
## num_casos_prueba_desconocida 0
CNE_tecnica %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
summary(CNE_casos)
## provincia_iso sexo grupo_edad fecha
## Length:702780 Length:702780 Length:702780 Length:702780
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## num_casos num_hosp num_uci num_def
## Min. : 0.000 Min. : 0.0000 Min. : 0.00000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0000
## Median : 0.000 Median : 0.0000 Median : 0.00000 Median : 0.0000
## Mean : 4.562 Mean : 0.4611 Mean : 0.04117 Mean : 0.1036
## 3rd Qu.: 2.000 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :771.000 Max. :269.0000 Max. :35.00000 Max. :100.0000
head(str(CNE_casos,vec.len=3))
## 'data.frame': 702780 obs. of 8 variables:
## $ provincia_iso: chr "A" "A" "A" ...
## $ sexo : chr "H" "H" "H" ...
## $ grupo_edad : chr "0-9" "10-19" "20-29" ...
## $ fecha : chr "2020-01-01" "2020-01-01" "2020-01-01" ...
## $ num_casos : int 0 0 0 0 0 0 0 0 ...
## $ num_hosp : int 0 0 0 0 0 0 0 0 ...
## $ num_uci : int 0 0 0 0 0 0 0 0 ...
## $ num_def : int 0 0 0 0 0 0 0 0 ...
## NULL
table(CNE_casos$provincia_iso)
##
## A AB AL AV B BA BI BU C CA CC CE CO
## 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260
## CR CS CU GC GI GR GU H HU J L LE LO
## 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260
## LU M MA ML MU NC O OR P PM PO S SA
## 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260
## SE SG SO SS T TE TF TO V VA VI Z ZA
## 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260 13260
We check missing values for CNE_casos. In this case also we omit the NA values.
aggr(CNE_casos, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(CNE_casos), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## provincia_iso 0.01886792
## sexo 0.00000000
## grupo_edad 0.00000000
## fecha 0.00000000
## num_casos 0.00000000
## num_hosp 0.00000000
## num_uci 0.00000000
## num_def 0.00000000
CNE_casos %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
###############################
CNE_casos <- na.omit(CNE_casos)
###############################
aggr(CNE_casos, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(CNE_casos), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## provincia_iso 0
## sexo 0
## grupo_edad 0
## fecha 0
## num_casos 0
## num_hosp 0
## num_uci 0
## num_def 0
CNE_casos %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
We are going to transform / eliminate:
# Transform / eliminate A
CNE_tecnica$fecha <- as.Date(CNE_tecnica$fecha ,format="%Y-%m-%d")
CNE_casos$fecha <- as.Date(CNE_casos$fecha ,format="%Y-%m-%d")
# Transform / eliminate B
CNE_casos<-within(CNE_casos, rm(grupo_edad, sexo))
# Iso code update for Navarra C
CNE_tecnica$provincia_iso[CNE_tecnica$provincia_iso=="NC"] <- "NA"
CNE_casos$provincia_iso[CNE_casos$provincia_iso=="NC"] <- "NA"
# Check table
head(CNE_tecnica,5)
## provincia_iso fecha num_casos num_casos_prueba_pcr
## 1 A 2020-01-01 0 0
## 2 AB 2020-01-01 0 0
## 3 AL 2020-01-01 0 0
## 4 AV 2020-01-01 0 0
## 5 B 2020-01-01 0 0
## num_casos_prueba_test_ac num_casos_prueba_ag num_casos_prueba_elisa
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## num_casos_prueba_desconocida
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
head(CNE_casos,5)
## provincia_iso fecha num_casos num_hosp num_uci num_def
## 1 A 2020-01-01 0 0 0 0
## 2 A 2020-01-01 0 0 0 0
## 3 A 2020-01-01 0 0 0 0
## 4 A 2020-01-01 0 0 0 0
## 5 A 2020-01-01 0 0 0 0
We check both dataframes offers the same total results.
CNE_tecnica %>%
group_by(provincia_iso) %>%
summarise_at(vars(num_casos), sum)
## # A tibble: 52 x 2
## provincia_iso num_casos
## <chr> <int>
## 1 A 143555
## 2 AB 26916
## 3 AL 47032
## 4 AV 11084
## 5 B 382992
## 6 BA 45886
## 7 BI 80588
## 8 BU 29808
## 9 C 51272
## 10 CA 70428
## # ... with 42 more rows
CNE_casos %>%
group_by(provincia_iso) %>%
summarise_at(vars(num_casos), sum)
## # A tibble: 52 x 2
## provincia_iso num_casos
## <chr> <int>
## 1 A 143555
## 2 AB 26916
## 3 AL 47032
## 4 AV 11084
## 5 B 382992
## 6 BA 45886
## 7 BI 80588
## 8 BU 29808
## 9 C 51272
## 10 CA 70428
## # ... with 42 more rows
We proceed to combine the different data sets into one.
Here we merge by columns “provincia_iso”,“fecha”.
# CNE_casos_g
CNE_casos_g = CNE_casos %>%
group_by(provincia_iso, fecha) %>%
summarise_at(vars(num_casos, num_hosp, num_uci, num_def), sum)
head(CNE_casos_g,5)
## # A tibble: 5 x 6
## # Groups: provincia_iso [1]
## provincia_iso fecha num_casos num_hosp num_uci num_def
## <chr> <date> <int> <int> <int> <int>
## 1 A 2020-01-01 0 1 0 0
## 2 A 2020-01-02 0 0 0 0
## 3 A 2020-01-03 0 0 0 0
## 4 A 2020-01-04 0 0 0 0
## 5 A 2020-01-05 0 1 0 0
# New dataframe CNE_tec_cas
CNE_tec_cas<-merge(CNE_tecnica,
CNE_casos_g, by.x=c("provincia_iso","fecha"),
by.y=c("provincia_iso","fecha"))
# We check both dataframes offers the same total results
CNE_tecnica %>%
group_by(provincia_iso) %>%
summarise_at(vars(num_casos), sum)
## # A tibble: 52 x 2
## provincia_iso num_casos
## <chr> <int>
## 1 A 143555
## 2 AB 26916
## 3 AL 47032
## 4 AV 11084
## 5 B 382992
## 6 BA 45886
## 7 BI 80588
## 8 BU 29808
## 9 C 51272
## 10 CA 70428
## # ... with 42 more rows
CNE_casos_g %>%
group_by(provincia_iso) %>%
summarise_at(vars(num_casos), sum)
## # A tibble: 52 x 2
## provincia_iso num_casos
## <chr> <int>
## 1 A 143555
## 2 AB 26916
## 3 AL 47032
## 4 AV 11084
## 5 B 382992
## 6 BA 45886
## 7 BI 80588
## 8 BU 29808
## 9 C 51272
## 10 CA 70428
## # ... with 42 more rows
head(CNE_tec_cas,5)
## provincia_iso fecha num_casos.x num_casos_prueba_pcr
## 1 A 2020-01-01 0 0
## 2 A 2020-01-02 0 0
## 3 A 2020-01-03 0 0
## 4 A 2020-01-04 0 0
## 5 A 2020-01-05 0 0
## num_casos_prueba_test_ac num_casos_prueba_ag num_casos_prueba_elisa
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## num_casos_prueba_desconocida num_casos.y num_hosp num_uci num_def
## 1 0 0 1 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 1 0 0
table(CNE_tec_cas$provincia_iso)
##
## A AB AL AV B BA BI BU C CA CC CE CO CR CS CU GC GI GR GU
## 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442
## H HU J L LE LO LU M MA ML MU NA O OR P PM PO S SA SE
## 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442 442
## SG SO SS T TE TF TO V VA VI Z ZA
## 442 442 442 442 442 442 442 442 442 442 442 442
Here we merge by columns “provincia_iso” / “fecha” and “iso_3166_2_code” / “date”.
# New dataframe GOG_CNE
GOG_CNE<-merge(CNE_tec_cas,
Google,
by.x=c("provincia_iso","fecha"),
by.y=c("iso_code","Date"))
head(GOG_CNE,5)
## provincia_iso fecha num_casos.x num_casos_prueba_pcr
## 1 A 2020-02-15 1 1
## 2 A 2020-02-16 1 1
## 3 A 2020-02-17 1 1
## 4 A 2020-02-18 1 1
## 5 A 2020-02-19 1 1
## num_casos_prueba_test_ac num_casos_prueba_ag num_casos_prueba_elisa
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## num_casos_prueba_desconocida num_casos.y num_hosp num_uci num_def
## 1 0 0 1 0 0
## 2 0 0 0 0 0
## 3 0 0 1 0 0
## 4 0 0 1 0 0
## 5 0 0 2 1 0
## sub_region_2 retail_and_recreation_percent_change_from_baseline
## 1 Alicante/Alacant 3
## 2 Alicante/Alacant -2
## 3 Alicante/Alacant 0
## 4 Alicante/Alacant -5
## 5 Alicante/Alacant 1
## grocery_and_pharmacy_percent_change_from_baseline
## 1 -1
## 2 1
## 3 2
## 4 -2
## 5 1
## parks_percent_change_from_baseline
## 1 34
## 2 8
## 3 9
## 4 -14
## 5 10
## transit_stations_percent_change_from_baseline
## 1 7
## 2 5
## 3 7
## 4 -2
## 5 3
## workplaces_percent_change_from_baseline
## 1 0
## 2 -2
## 3 3
## 4 2
## 5 3
## residential_percent_change_from_baseline
## 1 -1
## 2 -1
## 3 0
## 4 1
## 5 0
table(GOG_CNE$provincia_iso)
##
## A AB AL AV B BA BI BU C CA CC CE CO CR CS CU GC GI GR GU
## 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385
## H HU J L LE LO LU M MA ML MU NA O OR P PM PO S SA SE
## 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385 385
## SG SO SS T TE TF TO V VA VI Z ZA
## 385 385 385 385 385 385 385 385 385 385 385 385
Here we merge by columns “sub_region_2” / “fecha” and “Zonas.de.movilidad” / “Periodo”. With this dataset we have 21 features for study.
# New dataframe Total
Total<-merge(GOG_CNE,
EM3,
by.x=c("sub_region_2","fecha"),
by.y=c("Zonas.de.movilidad","Periodo"))
head(Total,5)
## sub_region_2 fecha provincia_iso num_casos.x num_casos_prueba_pcr
## 1 Albacete 2020-03-16 AB 137 132
## 2 Albacete 2020-03-17 AB 128 123
## 3 Albacete 2020-03-18 AB 114 107
## 4 Albacete 2020-03-19 AB 149 133
## 5 Albacete 2020-03-20 AB 131 121
## num_casos_prueba_test_ac num_casos_prueba_ag num_casos_prueba_elisa
## 1 5 0 0
## 2 5 0 0
## 3 7 0 0
## 4 16 0 0
## 5 10 0 0
## num_casos_prueba_desconocida num_casos.y num_hosp num_uci num_def
## 1 0 65 43 3 7
## 2 0 29 40 4 2
## 3 0 26 24 7 7
## 4 0 22 40 5 7
## 5 0 85 63 4 6
## retail_and_recreation_percent_change_from_baseline
## 1 -81
## 2 -84
## 3 -83
## 4 -93
## 5 -87
## grocery_and_pharmacy_percent_change_from_baseline
## 1 -32
## 2 -41
## 3 -32
## 4 -92
## 5 -34
## parks_percent_change_from_baseline
## 1 -73
## 2 -74
## 3 -70
## 4 -80
## 5 -74
## transit_stations_percent_change_from_baseline
## 1 -66
## 2 -72
## 3 -70
## 4 -86
## 5 -76
## workplaces_percent_change_from_baseline
## 1 -51
## 2 -56
## 3 -58
## 4 -85
## 5 -68
## residential_percent_change_from_baseline Total
## 1 22 9.900
## 2 23 9.705
## 3 23 9.510
## 4 35 9.130
## 5 32 8.750
head(str(Total,vec.len=1))
## 'data.frame': 15080 obs. of 20 variables:
## $ sub_region_2 : chr "Albacete" ...
## $ fecha : Date, format: "2020-03-16" ...
## $ provincia_iso : chr "AB" ...
## $ num_casos.x : int 137 128 ...
## $ num_casos_prueba_pcr : int 132 123 ...
## $ num_casos_prueba_test_ac : int 5 5 ...
## $ num_casos_prueba_ag : int 0 0 ...
## $ num_casos_prueba_elisa : int 0 0 ...
## $ num_casos_prueba_desconocida : int 0 0 ...
## $ num_casos.y : int 65 29 ...
## $ num_hosp : int 43 40 ...
## $ num_uci : int 3 4 ...
## $ num_def : int 7 2 ...
## $ retail_and_recreation_percent_change_from_baseline: num -81 -84 ...
## $ grocery_and_pharmacy_percent_change_from_baseline : num -32 -41 ...
## $ parks_percent_change_from_baseline : num -73 -74 ...
## $ transit_stations_percent_change_from_baseline : num -66 -72 ...
## $ workplaces_percent_change_from_baseline : num -51 -56 ...
## $ residential_percent_change_from_baseline : num 22 23 ...
## $ Total : num 9.9 ...
## NULL
summary(Total)
## sub_region_2 fecha provincia_iso num_casos.x
## Length:15080 Min. :2020-03-16 Length:15080 Min. : 0
## Class :character 1st Qu.:2020-05-27 Class :character 1st Qu.: 5
## Mode :character Median :2020-08-07 Mode :character Median : 39
## Mean :2020-08-07 Mean : 126
## 3rd Qu.:2020-10-19 3rd Qu.: 120
## Max. :2020-12-30 Max. :6565
## num_casos_prueba_pcr num_casos_prueba_test_ac num_casos_prueba_ag
## Min. : 0.0 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 5.0 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 35.0 Median : 0.0000 Median : 0.00
## Mean : 110.2 Mean : 0.2832 Mean : 15.19
## 3rd Qu.: 105.0 3rd Qu.: 0.0000 3rd Qu.: 4.00
## Max. :6546.0 Max. :32.0000 Max. :1465.00
## num_casos_prueba_elisa num_casos_prueba_desconocida num_casos.y
## Min. : 0.0000 Min. : 0.0000 Min. : 0
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 6
## Median : 0.0000 Median : 0.0000 Median : 37
## Mean : 0.1989 Mean : 0.1317 Mean : 127
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 117
## Max. :71.0000 Max. :65.0000 Max. :7724
## num_hosp num_uci num_def
## Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 4.00 Median : 0.000 Median : 1.000
## Mean : 14.86 Mean : 1.281 Mean : 3.437
## 3rd Qu.: 12.00 3rd Qu.: 1.000 3rd Qu.: 3.000
## Max. :1930.00 Max. :135.000 Max. :334.000
## retail_and_recreation_percent_change_from_baseline
## Min. :-97.00
## 1st Qu.:-57.00
## Median :-30.00
## Mean :-37.29
## 3rd Qu.:-17.00
## Max. : 71.00
## grocery_and_pharmacy_percent_change_from_baseline
## Min. :-96.00
## 1st Qu.:-24.00
## Median : -6.00
## Mean :-11.75
## 3rd Qu.: 4.00
## Max. :194.00
## parks_percent_change_from_baseline
## Min. :-94.000
## 1st Qu.:-30.000
## Median : -2.000
## Mean : 5.809
## 3rd Qu.: 30.000
## Max. :543.000
## transit_stations_percent_change_from_baseline
## Min. :-100.00
## 1st Qu.: -53.00
## Median : -31.00
## Mean : -35.19
## 3rd Qu.: -17.00
## Max. : 74.00
## workplaces_percent_change_from_baseline
## Min. :-92.00
## 1st Qu.:-43.00
## Median :-26.00
## Mean :-29.08
## 3rd Qu.:-13.00
## Max. : 55.00
## residential_percent_change_from_baseline Total
## Min. :-10.00 Min. : 1.95
## 1st Qu.: 4.00 1st Qu.:11.36
## Median : 7.00 Median :14.39
## Mean : 10.14 Mean :14.20
## 3rd Qu.: 14.00 3rd Qu.:17.11
## Max. : 48.00 Max. :29.00
Total$num_casos.x <- as.numeric(Total$num_casos.x)
Total$num_casos_prueba_pcr <- as.numeric(Total$num_casos_prueba_pcr)
Total$num_casos_prueba_test_ac <- as.numeric(Total$num_casos_prueba_test_ac)
Total$num_casos_prueba_ag <- as.numeric(Total$num_casos_prueba_ag)
Total$num_casos_prueba_elisa <- as.numeric(Total$num_casos_prueba_elisa)
Total$num_casos_prueba_desconocida <- as.numeric(Total$num_casos_prueba_desconocida)
Total$num_casos.y <- as.numeric(Total$num_casos.y)
Total$num_hosp <- as.numeric(Total$num_hosp)
Total$num_uci <- as.numeric(Total$num_uci)
Total$num_def <- as.numeric(Total$num_def)
table(Total$sub_region_2)
##
## Albacete Alicante/Alacant Almería
## 290 290 290
## Araba/Álava Asturias Ávila
## 290 290 290
## Badajoz Balears, Illes Barcelona
## 290 290 290
## Bizkaia Burgos Cáceres
## 290 290 290
## Cádiz Cantabria Castellón/Castelló
## 290 290 290
## Ceuta Ciudad Real Córdoba
## 290 290 290
## Coruña, A Cuenca Gipuzkoa
## 290 290 290
## Girona Granada Guadalajara
## 290 290 290
## Huelva Huesca Jaén
## 290 290 290
## León Lleida Lugo
## 290 290 290
## Madrid Málaga Melilla
## 290 290 290
## Murcia Navarra Ourense
## 290 290 290
## Palencia Palmas, Las Pontevedra
## 290 290 290
## Rioja, La Salamanca Santa Cruz de Tenerife
## 290 290 290
## Segovia Sevilla Soria
## 290 290 290
## Tarragona Teruel Toledo
## 290 290 290
## Valencia/València Valladolid Zamora
## 290 290 290
## Zaragoza
## 290
table(Total$provincia_iso)
##
## A AB AL AV B BA BI BU C CA CC CE CO CR CS CU GC GI GR GU
## 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290
## H HU J L LE LO LU M MA ML MU NA O OR P PM PO S SA SE
## 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290
## SG SO SS T TE TF TO V VA VI Z ZA
## 290 290 290 290 290 290 290 290 290 290 290 290
We check the missing values. We should have zero missing values
aggr(Total, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(Total), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## sub_region_2 0
## fecha 0
## provincia_iso 0
## num_casos.x 0
## num_casos_prueba_pcr 0
## num_casos_prueba_test_ac 0
## num_casos_prueba_ag 0
## num_casos_prueba_elisa 0
## num_casos_prueba_desconocida 0
## num_casos.y 0
## num_hosp 0
## num_uci 0
## num_def 0
## retail_and_recreation_percent_change_from_baseline 0
## grocery_and_pharmacy_percent_change_from_baseline 0
## parks_percent_change_from_baseline 0
## transit_stations_percent_change_from_baseline 0
## workplaces_percent_change_from_baseline 0
## residential_percent_change_from_baseline 0
## Total 0
Total %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing)) %>%
ggplot() +
geom_bar(aes(x=key, y=num.missing), stat = 'identity',fill="#F0E442") +
labs(x='variable', y="number of missing values",
title='Number of missing values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Review results
# Discrepancies due to different time-frames when merge CNE dataframes (see previous checks)
Total %>%
group_by(provincia_iso) %>%
summarise_at(vars(num_casos.x,num_casos.y), sum)
## # A tibble: 52 x 3
## provincia_iso num_casos.x num_casos.y
## <chr> <dbl> <dbl>
## 1 A 56493 55068
## 2 AB 16459 16626
## 3 AL 21488 21372
## 4 AV 6525 6681
## 5 B 257034 261208
## 6 BA 23165 22612
## 7 BI 56867 57728
## 8 BU 22742 22978
## 9 C 25641 25604
## 10 CA 31593 31225
## # ... with 42 more rows
# CSV file generation
head(Total,5)
## sub_region_2 fecha provincia_iso num_casos.x num_casos_prueba_pcr
## 1 Albacete 2020-03-16 AB 137 132
## 2 Albacete 2020-03-17 AB 128 123
## 3 Albacete 2020-03-18 AB 114 107
## 4 Albacete 2020-03-19 AB 149 133
## 5 Albacete 2020-03-20 AB 131 121
## num_casos_prueba_test_ac num_casos_prueba_ag num_casos_prueba_elisa
## 1 5 0 0
## 2 5 0 0
## 3 7 0 0
## 4 16 0 0
## 5 10 0 0
## num_casos_prueba_desconocida num_casos.y num_hosp num_uci num_def
## 1 0 65 43 3 7
## 2 0 29 40 4 2
## 3 0 26 24 7 7
## 4 0 22 40 5 7
## 5 0 85 63 4 6
## retail_and_recreation_percent_change_from_baseline
## 1 -81
## 2 -84
## 3 -83
## 4 -93
## 5 -87
## grocery_and_pharmacy_percent_change_from_baseline
## 1 -32
## 2 -41
## 3 -32
## 4 -92
## 5 -34
## parks_percent_change_from_baseline
## 1 -73
## 2 -74
## 3 -70
## 4 -80
## 5 -74
## transit_stations_percent_change_from_baseline
## 1 -66
## 2 -72
## 3 -70
## 4 -86
## 5 -76
## workplaces_percent_change_from_baseline
## 1 -51
## 2 -56
## 3 -58
## 4 -85
## 5 -68
## residential_percent_change_from_baseline Total
## 1 22 9.900
## 2 23 9.705
## 3 23 9.510
## 4 35 9.130
## 5 32 8.750
head(str(Total,vec.len=1))
## 'data.frame': 15080 obs. of 20 variables:
## $ sub_region_2 : chr "Albacete" ...
## $ fecha : Date, format: "2020-03-16" ...
## $ provincia_iso : chr "AB" ...
## $ num_casos.x : num 137 128 ...
## $ num_casos_prueba_pcr : num 132 123 ...
## $ num_casos_prueba_test_ac : num 5 5 ...
## $ num_casos_prueba_ag : num 0 0 ...
## $ num_casos_prueba_elisa : num 0 0 ...
## $ num_casos_prueba_desconocida : num 0 0 ...
## $ num_casos.y : num 65 29 ...
## $ num_hosp : num 43 40 ...
## $ num_uci : num 3 4 ...
## $ num_def : num 7 2 ...
## $ retail_and_recreation_percent_change_from_baseline: num -81 -84 ...
## $ grocery_and_pharmacy_percent_change_from_baseline : num -32 -41 ...
## $ parks_percent_change_from_baseline : num -73 -74 ...
## $ transit_stations_percent_change_from_baseline : num -66 -72 ...
## $ workplaces_percent_change_from_baseline : num -51 -56 ...
## $ residential_percent_change_from_baseline : num 22 23 ...
## $ Total : num 9.9 ...
## NULL
summary(Total)
## sub_region_2 fecha provincia_iso num_casos.x
## Length:15080 Min. :2020-03-16 Length:15080 Min. : 0
## Class :character 1st Qu.:2020-05-27 Class :character 1st Qu.: 5
## Mode :character Median :2020-08-07 Mode :character Median : 39
## Mean :2020-08-07 Mean : 126
## 3rd Qu.:2020-10-19 3rd Qu.: 120
## Max. :2020-12-30 Max. :6565
## num_casos_prueba_pcr num_casos_prueba_test_ac num_casos_prueba_ag
## Min. : 0.0 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 5.0 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 35.0 Median : 0.0000 Median : 0.00
## Mean : 110.2 Mean : 0.2832 Mean : 15.19
## 3rd Qu.: 105.0 3rd Qu.: 0.0000 3rd Qu.: 4.00
## Max. :6546.0 Max. :32.0000 Max. :1465.00
## num_casos_prueba_elisa num_casos_prueba_desconocida num_casos.y
## Min. : 0.0000 Min. : 0.0000 Min. : 0
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 6
## Median : 0.0000 Median : 0.0000 Median : 37
## Mean : 0.1989 Mean : 0.1317 Mean : 127
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 117
## Max. :71.0000 Max. :65.0000 Max. :7724
## num_hosp num_uci num_def
## Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 4.00 Median : 0.000 Median : 1.000
## Mean : 14.86 Mean : 1.281 Mean : 3.437
## 3rd Qu.: 12.00 3rd Qu.: 1.000 3rd Qu.: 3.000
## Max. :1930.00 Max. :135.000 Max. :334.000
## retail_and_recreation_percent_change_from_baseline
## Min. :-97.00
## 1st Qu.:-57.00
## Median :-30.00
## Mean :-37.29
## 3rd Qu.:-17.00
## Max. : 71.00
## grocery_and_pharmacy_percent_change_from_baseline
## Min. :-96.00
## 1st Qu.:-24.00
## Median : -6.00
## Mean :-11.75
## 3rd Qu.: 4.00
## Max. :194.00
## parks_percent_change_from_baseline
## Min. :-94.000
## 1st Qu.:-30.000
## Median : -2.000
## Mean : 5.809
## 3rd Qu.: 30.000
## Max. :543.000
## transit_stations_percent_change_from_baseline
## Min. :-100.00
## 1st Qu.: -53.00
## Median : -31.00
## Mean : -35.19
## 3rd Qu.: -17.00
## Max. : 74.00
## workplaces_percent_change_from_baseline
## Min. :-92.00
## 1st Qu.:-43.00
## Median :-26.00
## Mean :-29.08
## 3rd Qu.:-13.00
## Max. : 55.00
## residential_percent_change_from_baseline Total
## Min. :-10.00 Min. : 1.95
## 1st Qu.: 4.00 1st Qu.:11.36
## Median : 7.00 Median :14.39
## Mean : 10.14 Mean :14.20
## 3rd Qu.: 14.00 3rd Qu.:17.11
## Max. : 48.00 Max. :29.00
table(Total$provincia_iso)
##
## A AB AL AV B BA BI BU C CA CC CE CO CR CS CU GC GI GR GU
## 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290
## H HU J L LE LO LU M MA ML MU NA O OR P PM PO S SA SE
## 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290 290
## SG SO SS T TE TF TO V VA VI Z ZA
## 290 290 290 290 290 290 290 290 290 290 290 290
write.csv2(Total,"D:\\UOC Master Data Science\\_ M2.882 - TFM - Área 5\\UOC - Guia - PECS\\Pec3\\Total.csv",
row.names = FALSE)
We have generated some plots from the dataframe object generated.
# Line plots
# All num_casos.x
ggplot(Total, aes(x=fecha, y=num_casos.x, group=sub_region_2)) +
geom_line(aes(linetype=sub_region_2, color=sub_region_2))+
geom_point(aes(color=sub_region_2))+
theme(legend.position="top") +
labs(title="Cases by Province",
x ="Date", y = "Nº of cases")
# All Total (mobility)
ggplot(Total, aes(x=fecha, y=Total, group=sub_region_2)) +
geom_line(aes(linetype=sub_region_2, color=sub_region_2))+
geom_point(aes(color=sub_region_2))+
theme(legend.position="top") +
labs(title="Mobility Change by Province",
x ="Date", y = "% Mobility")
# Mal, Sev and Cad - num_casos.x
Total %>%
filter(sub_region_2 == "Málaga" | sub_region_2 == "Cádiz" |
sub_region_2 == "Sevilla") %>%
ggplot(aes(x=fecha, y=num_casos.x))+
geom_line(aes(color=sub_region_2))+
geom_point(aes(color=sub_region_2))+
theme(legend.position="top") +
labs(title="Cases reported by Province (Málaga, Sevilla and Cádiz)",
x ="Date", y = "Nº of cases")
# Mal, Sev and Cad - Total (mobility)
Total %>%
filter(sub_region_2 == "Málaga" | sub_region_2 == "Cádiz" |
sub_region_2 == "Sevilla") %>%
ggplot(aes(x=fecha, y=Total))+
geom_line(aes(color=sub_region_2))+
geom_point(aes(color=sub_region_2))+
theme(legend.position="top") +
labs(title="Mobility Change by Province (Málaga, Sevilla and Cádiz)",
x ="Date", y = "% Mobility")
We have generated some plots from the time-series object generated. We use tsibble().
# Convert dataframe to ts object
#library(fpp3)
Total_ts <- Total[-3] %>%
mutate(Dia_c = as_date(fecha)) %>%
select(-fecha) %>%
as_tsibble(key = c(sub_region_2),
index = Dia_c)
# Filter for Bar, Mad, Mal, Cor and, Cad
Total_ts %>% filter(sub_region_2 == "Barcelona" | sub_region_2 == "Madrid" |
sub_region_2 == "Málaga" | sub_region_2 == "Sevilla" |
sub_region_2 == "Cádiz") -> Total_ts_b
############################################
Total_ts
## # A tsibble: 15,080 x 19 [1D]
## # Key: sub_region_2 [52]
## sub_region_2 num_casos.x num_casos_prueb~ num_casos_prueb~ num_casos_prueb~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Albacete 137 132 5 0
## 2 Albacete 128 123 5 0
## 3 Albacete 114 107 7 0
## 4 Albacete 149 133 16 0
## 5 Albacete 131 121 10 0
## 6 Albacete 129 120 9 0
## 7 Albacete 125 112 13 0
## 8 Albacete 112 103 9 0
## 9 Albacete 107 91 16 0
## 10 Albacete 78 64 14 0
## # ... with 15,070 more rows, and 14 more variables:
## # num_casos_prueba_elisa <dbl>, num_casos_prueba_desconocida <dbl>,
## # num_casos.y <dbl>, num_hosp <dbl>, num_uci <dbl>, num_def <dbl>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>,
## # residential_percent_change_from_baseline <dbl>, Total <dbl>, Dia_c <date>
Total_ts_b
## # A tsibble: 1,450 x 19 [1D]
## # Key: sub_region_2 [5]
## sub_region_2 num_casos.x num_casos_prueb~ num_casos_prueb~ num_casos_prueb~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Barcelona 1424 1351 0 0
## 2 Barcelona 1309 1273 0 0
## 3 Barcelona 1420 1379 0 0
## 4 Barcelona 1338 1289 0 0
## 5 Barcelona 1614 1557 0 0
## 6 Barcelona 1164 1136 0 0
## 7 Barcelona 1065 1032 0 0
## 8 Barcelona 1234 1187 0 0
## 9 Barcelona 1137 1095 0 0
## 10 Barcelona 1159 1131 0 0
## # ... with 1,440 more rows, and 14 more variables:
## # num_casos_prueba_elisa <dbl>, num_casos_prueba_desconocida <dbl>,
## # num_casos.y <dbl>, num_hosp <dbl>, num_uci <dbl>, num_def <dbl>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>,
## # residential_percent_change_from_baseline <dbl>, Total <dbl>, Dia_c <date>
Total_ts_b %>% distinct(sub_region_2)
## # A tibble: 5 x 1
## sub_region_2
## <chr>
## 1 Barcelona
## 2 Cádiz
## 3 Madrid
## 4 Málaga
## 5 Sevilla
###########################################
# Plots
# A num_casos.x,num_casos.y
autoplot(Total_ts_b, vars(num_casos.x,num_casos.y)) +
labs(y = "Nº of Cases",
title = "Reported Cases (CNE num_casos.x vs num_casos.y)")
# B Total (mobility)
autoplot(Total_ts_b, Total) +
facet_wrap(~sub_region_2, scales = "free_y", ncol=2) +
theme(legend.position = "top") +
scale_x_date(date_minor_breaks = "1 day", name = "Time (Daily)") +
ggtitle(label = "Mobility Change by Province (Barcelona, Madrid, Málaga,
Córdoba and Cádiz)")
# C sub_region_2 == "Barcelona" by month
Total_ts %>% filter(sub_region_2 == "Barcelona") %>%
gg_season(num_casos.x, period = "month", labels = "both") +
theme(legend.position = "top") +
labs(y="Nº of Cases", title="Barcelona - Infections by Month")
# D Google (% residential change)
autoplot(Total_ts_b, residential_percent_change_from_baseline) +
facet_wrap(~sub_region_2, scales = "free_y", ncol=2) +
theme(legend.position = "top") +
scale_x_date(date_minor_breaks = "1 day", name = "Time (Daily)") +
ggtitle(label = "Google % residential change (Barcelona, Madrid, Málaga,
Córdoba and Cádiz)")
# Filter to "sub_region_2" == "Barcelona" or "Málaga"
# Character / date columns are eliminated
#library(corrplot)
#### Málaga
# pearson
Total.res<-Total %>%
filter(sub_region_2 == "Málaga")
Total.res<-cor(Total.res[,c(-1,-2,-3)],method="pearson")
corrplot.mixed(Total.res,upper="circle",number.cex=.65,tl.cex=.6, title="Málaga -
pearson ")
# spearman
Total.res<-Total %>%
filter(sub_region_2 == "Málaga")
Total.res<-cor(Total.res[,c(-1,-2,-3)],method="spearman")
corrplot.mixed(Total.res,upper="circle",number.cex=.65,tl.cex=.6, title="Málaga -
spearman ")
# kendall
Total.res<-Total %>%
filter(sub_region_2 == "Málaga")
Total.res<-cor(Total.res[,c(-1,-2,-3)],method="kendall")
corrplot.mixed(Total.res,upper="circle",number.cex=.65,tl.cex=.6, title="Málaga -
kendall ")
#### Barcelona
# pearson
Total.res<-Total %>%
filter(sub_region_2 == "Barcelona")
Total.res<-cor(Total.res[,c(-1,-2,-3)],method="pearson")
corrplot.mixed(Total.res,upper="circle",number.cex=.65,tl.cex=.6, title="Barcelona -
pearson ")
# spearman
Total.res<-Total %>%
filter(sub_region_2 == "Barcelona")
Total.res<-cor(Total.res[,c(-1,-2,-3)],method="spearman")
corrplot.mixed(Total.res,upper="circle",number.cex=.65,tl.cex=.6, title="Barcelona -
spearman ")
# kendall
Total.res<-Total %>%
filter(sub_region_2 == "Barcelona")
Total.res<-cor(Total.res[,c(-1,-2,-3)],method="kendall")
corrplot.mixed(Total.res,upper="circle",number.cex=.65,tl.cex=.6, title="Barcelona -
kendall ")
pca <- prcomp(Total.res, scale = T)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.0590 1.8914 1.12142 1.02409 0.83504 0.57825 0.46266
## Proportion of Variance 0.5504 0.2104 0.07398 0.06169 0.04102 0.01967 0.01259
## Cumulative Proportion 0.5504 0.7609 0.83485 0.89655 0.93756 0.95723 0.96982
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.41006 0.33620 0.28445 0.21986 0.17734 0.16911 0.15372
## Proportion of Variance 0.00989 0.00665 0.00476 0.00284 0.00185 0.00168 0.00139
## Cumulative Proportion 0.97971 0.98636 0.99112 0.99397 0.99582 0.99750 0.99889
## PC15 PC16 PC17
## Standard deviation 0.1369 0.01274 1.826e-17
## Proportion of Variance 0.0011 0.00001 0.000e+00
## Cumulative Proportion 1.0000 1.00000 1.000e+00
pca$rotation
## PC1 PC2
## num_casos.x -0.1354826901 -0.469400471
## num_casos_prueba_pcr -0.1356873829 -0.469237915
## num_casos_prueba_test_ac -0.0782958432 0.237502756
## num_casos_prueba_ag 0.0005323686 0.008790618
## num_casos_prueba_elisa 0.0495912937 0.004385553
## num_casos_prueba_desconocida -0.2907430443 0.113686282
## num_casos.y -0.1546876637 -0.450080493
## num_hosp -0.2956764064 -0.199498016
## num_uci -0.2788863801 -0.230508957
## num_def -0.3153112111 -0.030904089
## retail_and_recreation_percent_change_from_baseline 0.3077977291 -0.116093786
## grocery_and_pharmacy_percent_change_from_baseline 0.2553316609 -0.261732823
## parks_percent_change_from_baseline 0.3128404870 0.012776948
## transit_stations_percent_change_from_baseline 0.2855716515 -0.218931952
## workplaces_percent_change_from_baseline 0.2622358364 -0.170838668
## residential_percent_change_from_baseline -0.2895648649 0.166332144
## Total 0.2993175108 -0.081611685
## PC3 PC4
## num_casos.x -0.002875288 -0.028445983
## num_casos_prueba_pcr -0.012292151 -0.026795983
## num_casos_prueba_test_ac -0.392712472 -0.268237834
## num_casos_prueba_ag -0.034460138 0.950253303
## num_casos_prueba_elisa 0.811470962 -0.060011278
## num_casos_prueba_desconocida 0.019674417 -0.078105637
## num_casos.y 0.041668217 -0.036946432
## num_hosp -0.051419434 -0.010413059
## num_uci -0.018126448 0.006323566
## num_def -0.031133681 0.009747098
## retail_and_recreation_percent_change_from_baseline 0.033200491 -0.075398568
## grocery_and_pharmacy_percent_change_from_baseline 0.027494995 -0.030995413
## parks_percent_change_from_baseline 0.029299658 -0.011018427
## transit_stations_percent_change_from_baseline -0.099387689 -0.018021579
## workplaces_percent_change_from_baseline -0.303553292 0.028981871
## residential_percent_change_from_baseline 0.211975965 0.003369575
## Total 0.173800838 -0.065471155
## PC5 PC6
## num_casos.x -0.15653475 0.16549669
## num_casos_prueba_pcr -0.15256567 0.17028866
## num_casos_prueba_test_ac -0.81534671 -0.10365823
## num_casos_prueba_ag -0.25969423 -0.01360833
## num_casos_prueba_elisa -0.34805371 -0.37185910
## num_casos_prueba_desconocida 0.20580692 -0.11429059
## num_casos.y -0.15827091 0.15297764
## num_hosp 0.03855580 -0.11688342
## num_uci 0.06172358 -0.06173530
## num_def 0.08791553 -0.26888200
## retail_and_recreation_percent_change_from_baseline -0.02731712 0.18700422
## grocery_and_pharmacy_percent_change_from_baseline 0.01445198 -0.34534708
## parks_percent_change_from_baseline -0.01705528 0.39173313
## transit_stations_percent_change_from_baseline 0.03520761 -0.26003250
## workplaces_percent_change_from_baseline 0.10943614 -0.48832683
## residential_percent_change_from_baseline 0.03104587 0.13040337
## Total -0.05468582 0.19337305
## PC7 PC8
## num_casos.x 0.11538168 0.07515692
## num_casos_prueba_pcr 0.11048811 0.07713501
## num_casos_prueba_test_ac -0.08510797 -0.12117026
## num_casos_prueba_ag -0.03926649 -0.14832473
## num_casos_prueba_elisa 0.24859208 0.01374512
## num_casos_prueba_desconocida 0.19845835 -0.75630827
## num_casos.y 0.04117476 -0.02860659
## num_hosp -0.11367668 -0.07611704
## num_uci -0.02723992 -0.20583241
## num_def -0.20890547 0.15436921
## retail_and_recreation_percent_change_from_baseline -0.10130686 -0.31700087
## grocery_and_pharmacy_percent_change_from_baseline -0.63305922 -0.04093499
## parks_percent_change_from_baseline 0.15076652 0.04293438
## transit_stations_percent_change_from_baseline -0.06852096 -0.19335324
## workplaces_percent_change_from_baseline 0.37588850 0.19906444
## residential_percent_change_from_baseline -0.39494991 0.23079899
## Total -0.26415699 -0.26912855
## PC9 PC10
## num_casos.x -0.204798939 -0.059620970
## num_casos_prueba_pcr -0.206074966 -0.057649643
## num_casos_prueba_test_ac 0.031408747 0.010328003
## num_casos_prueba_ag -0.027397924 -0.004813823
## num_casos_prueba_elisa 0.085159377 -0.047189805
## num_casos_prueba_desconocida -0.349092565 -0.125363275
## num_casos.y -0.130984317 0.031574368
## num_hosp 0.283065654 0.067094187
## num_uci 0.717067401 0.169083317
## num_def -0.009119052 0.013475201
## retail_and_recreation_percent_change_from_baseline 0.099531350 -0.317145228
## grocery_and_pharmacy_percent_change_from_baseline -0.110976209 -0.296856620
## parks_percent_change_from_baseline 0.283874181 -0.240873233
## transit_stations_percent_change_from_baseline 0.094114251 -0.074265913
## workplaces_percent_change_from_baseline -0.125028631 0.240735868
## residential_percent_change_from_baseline -0.178700468 0.013386663
## Total -0.118124812 0.793981499
## PC11 PC12
## num_casos.x -0.049617465 -0.0988357734
## num_casos_prueba_pcr -0.044726847 -0.1058640000
## num_casos_prueba_test_ac -0.042562355 -0.0389190655
## num_casos_prueba_ag 0.003758095 -0.0008550931
## num_casos_prueba_elisa 0.023430627 -0.0013867741
## num_casos_prueba_desconocida -0.154872738 -0.1615965585
## num_casos.y 0.033429358 0.1359821524
## num_hosp 0.302911164 0.0671255962
## num_uci -0.400459562 0.0477114671
## num_def 0.553043315 -0.1950704924
## retail_and_recreation_percent_change_from_baseline 0.286708424 0.6275693247
## grocery_and_pharmacy_percent_change_from_baseline -0.348092535 -0.0594956464
## parks_percent_change_from_baseline -0.082227368 -0.4958947989
## transit_stations_percent_change_from_baseline 0.264287632 -0.4433548371
## workplaces_percent_change_from_baseline -0.216784094 0.2008296543
## residential_percent_change_from_baseline -0.278596900 0.0313367951
## Total 0.074087201 -0.0830357636
## PC13 PC14
## num_casos.x 0.3503737964 -0.03447444
## num_casos_prueba_pcr 0.3691954013 -0.03383423
## num_casos_prueba_test_ac -0.0023721487 -0.01431289
## num_casos_prueba_ag 0.0009495141 -0.01895981
## num_casos_prueba_elisa 0.0189278939 -0.03470038
## num_casos_prueba_desconocida -0.0341231415 -0.11380732
## num_casos.y -0.8087585769 0.15136854
## num_hosp -0.0093409355 -0.32920173
## num_uci 0.0905105752 0.12476243
## num_def -0.0179131471 -0.20425404
## retail_and_recreation_percent_change_from_baseline 0.1591640063 0.05248551
## grocery_and_pharmacy_percent_change_from_baseline -0.0689717137 -0.30356466
## parks_percent_change_from_baseline -0.1967768342 -0.33947866
## transit_stations_percent_change_from_baseline 0.0373896719 0.66046288
## workplaces_percent_change_from_baseline -0.0493969622 -0.09490414
## residential_percent_change_from_baseline 0.0240600499 0.34670086
## Total 0.0590314137 -0.13712447
## PC15 PC16
## num_casos.x 0.021886709 -0.710827710
## num_casos_prueba_pcr 0.020989875 0.696614910
## num_casos_prueba_test_ac 0.002128322 -0.009045443
## num_casos_prueba_ag -0.001031653 -0.006169593
## num_casos_prueba_elisa -0.019875868 0.004055133
## num_casos_prueba_desconocida -0.000227830 -0.014131490
## num_casos.y 0.065935929 0.012850260
## num_hosp -0.738022542 -0.005532761
## num_uci 0.291185664 -0.005062923
## num_def 0.517898894 -0.032682612
## retail_and_recreation_percent_change_from_baseline 0.103782290 -0.031882076
## grocery_and_pharmacy_percent_change_from_baseline 0.033876519 0.014999252
## parks_percent_change_from_baseline -0.032945065 -0.039812604
## transit_stations_percent_change_from_baseline -0.169052448 -0.008281975
## workplaces_percent_change_from_baseline -0.059237981 -0.040905347
## residential_percent_change_from_baseline -0.223342981 -0.056238440
## Total 0.040900018 -0.009746369
## PC17
## num_casos.x -0.04694046
## num_casos_prueba_pcr 0.08912178
## num_casos_prueba_test_ac 0.09544863
## num_casos_prueba_ag 0.05828430
## num_casos_prueba_elisa 0.04498527
## num_casos_prueba_desconocida 0.16508413
## num_casos.y -0.01242631
## num_hosp 0.04584112
## num_uci 0.06322805
## num_def 0.30912666
## retail_and_recreation_percent_change_from_baseline 0.33484091
## grocery_and_pharmacy_percent_change_from_baseline -0.13871900
## parks_percent_change_from_baseline 0.41803506
## transit_stations_percent_change_from_baseline 0.04834211
## workplaces_percent_change_from_baseline 0.44739326
## residential_percent_change_from_baseline 0.57600490
## Total 0.07657033
if(!require(FactoMineR)){
install.packages('FactoMineR', repos='http://cran.us.r-project.org')
library(FactoMineR)}
if(!require(factoextra)){
install.packages('factoextra', repos='http://cran.us.r-project.org')
library(factoextra)}
# Var contribution for PC1-PC5
fviz_contrib(pca, choice = "var", axes = 1)
fviz_contrib(pca, choice = "var", axes = 2)
fviz_contrib(pca, choice = "var", axes = 3)
fviz_contrib(pca, choice = "var", axes = 4)
fviz_contrib(pca, choice = "var", axes = 5)
# Check for Barcelona
# Raw
Total %>%
filter(sub_region_2 == "Barcelona") -> Total_bar
par(mfrow=c(2,2))
hist(Total_bar$num_casos.x)
hist(Total_bar$num_casos.y)
qqnorm(Total_bar$num_casos.x, main="Nº Cases X")
qqline(Total_bar$num_casos.x,col=2)
qqnorm(Total_bar$num_casos.y, main="Nº Cases Y")
qqline(Total_bar$num_casos.y,col=2)
# Normalize
library(DescTools)
norm_num_casos.x <- BoxCox(Total_bar$num_casos.x, lambda =
BoxCoxLambda(Total_bar$num_casos.x))
norm_num_casos.y <- BoxCox(Total_bar$num_casos.y, lambda =
BoxCoxLambda(Total_bar$num_casos.y))
hist(norm_num_casos.x)
hist(norm_num_casos.y)
qqnorm(norm_num_casos.x, main="Nº Cases X")
qqline(norm_num_casos.x,col=2)
qqnorm(norm_num_casos.y, main="Nº Cases Y")
qqline(norm_num_casos.y,col=2)
# Columns removal according PCA results and SME knowledge
Total <- Total[c(-3,-6,-7,-8,-10)]
Total_bar <- Total_bar[c(-3,-6,-7,-8,-10)]
Total_ts <- Total_ts[c(-4,-5,-6,-8)]
Total_ts_b <- Total_ts_b[c(-4,-5,-6,-8)]
table(Total_ts$sub_region_2)
##
## Albacete Alicante/Alacant Almería
## 290 290 290
## Araba/Álava Asturias Ávila
## 290 290 290
## Badajoz Balears, Illes Barcelona
## 290 290 290
## Bizkaia Burgos Cáceres
## 290 290 290
## Cádiz Cantabria Castellón/Castelló
## 290 290 290
## Ceuta Ciudad Real Córdoba
## 290 290 290
## Coruña, A Cuenca Gipuzkoa
## 290 290 290
## Girona Granada Guadalajara
## 290 290 290
## Huelva Huesca Jaén
## 290 290 290
## León Lleida Lugo
## 290 290 290
## Madrid Málaga Melilla
## 290 290 290
## Murcia Navarra Ourense
## 290 290 290
## Palencia Palmas, Las Pontevedra
## 290 290 290
## Rioja, La Salamanca Santa Cruz de Tenerife
## 290 290 290
## Segovia Sevilla Soria
## 290 290 290
## Tarragona Teruel Toledo
## 290 290 290
## Valencia/València Valladolid Zamora
## 290 290 290
## Zaragoza
## 290
#str(Total_ts)
summary(Total_ts)
## sub_region_2 num_casos.x num_casos_prueba_pcr
## Length:15080 Min. : 0 Min. : 0.0
## Class :character 1st Qu.: 5 1st Qu.: 5.0
## Mode :character Median : 39 Median : 35.0
## Mean : 126 Mean : 110.2
## 3rd Qu.: 120 3rd Qu.: 105.0
## Max. :6565 Max. :6546.0
## num_casos_prueba_desconocida num_hosp num_uci
## Min. : 0.0000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.00 1st Qu.: 0.000
## Median : 0.0000 Median : 4.00 Median : 0.000
## Mean : 0.1317 Mean : 14.86 Mean : 1.281
## 3rd Qu.: 0.0000 3rd Qu.: 12.00 3rd Qu.: 1.000
## Max. :65.0000 Max. :1930.00 Max. :135.000
## num_def retail_and_recreation_percent_change_from_baseline
## Min. : 0.000 Min. :-97.00
## 1st Qu.: 0.000 1st Qu.:-57.00
## Median : 1.000 Median :-30.00
## Mean : 3.437 Mean :-37.29
## 3rd Qu.: 3.000 3rd Qu.:-17.00
## Max. :334.000 Max. : 71.00
## grocery_and_pharmacy_percent_change_from_baseline
## Min. :-96.00
## 1st Qu.:-24.00
## Median : -6.00
## Mean :-11.75
## 3rd Qu.: 4.00
## Max. :194.00
## parks_percent_change_from_baseline
## Min. :-94.000
## 1st Qu.:-30.000
## Median : -2.000
## Mean : 5.809
## 3rd Qu.: 30.000
## Max. :543.000
## transit_stations_percent_change_from_baseline
## Min. :-100.00
## 1st Qu.: -53.00
## Median : -31.00
## Mean : -35.19
## 3rd Qu.: -17.00
## Max. : 74.00
## workplaces_percent_change_from_baseline
## Min. :-92.00
## 1st Qu.:-43.00
## Median :-26.00
## Mean :-29.08
## 3rd Qu.:-13.00
## Max. : 55.00
## residential_percent_change_from_baseline Total Dia_c
## Min. :-10.00 Min. : 1.95 Min. :2020-03-16
## 1st Qu.: 4.00 1st Qu.:11.36 1st Qu.:2020-05-27
## Median : 7.00 Median :14.39 Median :2020-08-07
## Mean : 10.14 Mean :14.20 Mean :2020-08-07
## 3rd Qu.: 14.00 3rd Qu.:17.11 3rd Qu.:2020-10-19
## Max. : 48.00 Max. :29.00 Max. :2020-12-30
# Total_ts plots for the selected variables
# % of mobility reported by INE and Google (EM3 study)
Total_ts_b %>%
filter(sub_region_2 == "Barcelona") %>%
pivot_longer(c(2,13,14)) %>%
ggplot(aes(x = Dia_c, y = value)) +
geom_line() +
facet_grid(vars(name), scales = "free_y")+
labs(title = "Bar - Nº cases (CNE) vs % Residentail (Google) and Tot (INE)
mobility change")
# Barcelona Seasonal plot: Nº cases
Total_ts_b %>%
filter(sub_region_2 == "Barcelona") %>%
gg_season(num_casos.x, labels = "both") +
labs(y = "Nº cases",
title = "Barcelona Seasonal plot: Nº cases")
# Barcelona Seasonal plot: Nº cases by month
Total_ts_b %>%
filter(sub_region_2 == "Barcelona") %>%
gg_season(num_casos.x, period = "month") +
theme(legend.position = "none") +
labs(y="Nº cases", title="Barcelona Seasonal plot: Nº cases - month")
# Barcelona scatter plot Nº cases vs residential_percent_change
Total_ts_b %>%
filter(sub_region_2 == "Barcelona") %>%
ggplot(aes(x = num_casos.x, y = residential_percent_change_from_baseline )) +
geom_point() +
labs(x = "num_casos.x)",
y = "residential_percent_change",
title="Barcelona scatter plot Nº cases vs residential_percent_chang")
#######################################
# A - Nº cases per province (Barcelona, Madrid, Málaga, Córdoba and Cádiz)
autoplot(Total_ts_b, num_casos.x) +
labs(y = "Nº cases",
title = "Nº cases per province")
# B - Nº cases per province (Barcelona, Madrid, Málaga, Córdoba and Cádiz)
Total_ts_b %>%
group_by(sub_region_2) %>%
summarise(CASOS = sum(num_casos.x))%>%
ggplot(aes(x = Dia_c, y = CASOS)) +
geom_line() +
facet_grid(vars(sub_region_2), scales = "free_y") +
labs(title = "Nº cases per province", y= "Nº cases")
# B.b - Google % change residential mobility per province (Barcelona, Madrid, Málaga, Cádiz and Sevilla)
Total_ts_b %>%
group_by(sub_region_2) %>%
summarise(per_c = (residential_percent_change_from_baseline))%>%
ggplot(aes(x = Dia_c, y = per_c)) +
geom_line() +
facet_grid(vars(sub_region_2), scales = "free_y") +
labs(title = "Google % change residential mobility per province", y= "% evolution")
# B.c - EM3 % change residential mobility per province (Barcelona, Madrid, Málaga, Cádiz and Sevilla)
Total_ts_b %>%
group_by(sub_region_2) %>%
summarise(per_c = (Total))%>%
ggplot(aes(x = Dia_c, y = per_c)) +
geom_line() +
facet_grid(vars(sub_region_2), scales = "free_y") +
labs(title = "EM3 % change residential mobility per province", y= "% evolution")
# % residential percent change (Barcelona, Madrid, Málaga, Cádiz and Sevilla)
autoplot(Total_ts_b, residential_percent_change_from_baseline ) +
labs(y = "% residential percent change",
title = "Residential percent change")
# Nº cases per province and day of week + mean
Total_ts_b %>%
gg_subseries(num_casos.x, period = "week") +
labs(y = "Nº cases + mean",
title = "Nº cases per province and day of week + mean")
# Correlation plot / nº cases by province
Total_ts_b %>%
group_by(sub_region_2) %>%
summarise(CASOS = sum(num_casos.x))%>%
pivot_wider(values_from=CASOS, names_from=sub_region_2) %>%
GGally::ggpairs(2:6)
# % of mobility reported by Google - residential_percent_change
autoplot(Total_ts_b, residential_percent_change_from_baseline) +
facet_wrap(~sub_region_2, scales = "free_y", ncol=2) +
theme(legend.position = "top") +
scale_x_date(date_minor_breaks = "1 day", name = "Time (Daily)") +
ggtitle(label = "% of mobility reported by Google - home (Barcelona, Madrid, Málaga, Cádiz and Sevilla)")
As stated by (Hyndman and Athanasopoulos 2021)… "STL has several advantages over classical decomposition, and the SEATS and X-11 methods:
# Check Seasonal and trend
#Total_ts %>%
# filter(sub_region_2 == "Barcelona") %>%
# model(STL(num_casos.x)) %>%
# components() %>%
# autoplot()
#Total_ts %>%
# filter(sub_region_2 == "Barcelona") %>%
# model(STL(num_casos.x ~ season(window = 7))) %>%
# components() %>%
# autoplot()
Total_ts %>%
#filter_index("2020-09-1" ~ "2020-12-31") %>%
filter(sub_region_2 == "Barcelona") %>%
model(STL(num_casos.x ~ season(window = 7) +
trend(window = 7))) %>%
components() %>%
autoplot() + labs(title="Barcelona")
Total_ts %>%
#filter_index("2020-09-1" ~ "2020-12-31") %>%
filter(sub_region_2 == "Madrid") %>%
model(STL(num_casos.x ~ season(window = 7) +
trend(window = 7))) %>%
components() %>%
autoplot() + labs(title="Madrid")
Total_ts %>%
#filter_index("2020-09-1" ~ "2020-12-31") %>%
filter(sub_region_2 == "Málaga") %>%
model(STL(num_casos.x ~ season(window = 7) +
trend(window = 7))) %>%
components() %>%
autoplot() + labs(title="Málaga")
Total_ts %>%
#filter_index("2020-09-1" ~ "2020-12-31") %>%
filter(sub_region_2 == "Cádiz") %>%
model(STL(num_casos.x ~ season(window = 7) +
trend(window = 7))) %>%
components() %>%
autoplot() + labs(title="Cádiz")
Total_ts %>%
#filter_index("2020-09-1" ~ "2020-12-31") %>%
filter(sub_region_2 == "Sevilla") %>%
model(STL(num_casos.x ~ season(window = 7) +
trend(window = 7))) %>%
components() %>%
autoplot() + labs(title="Sevilla")
As stated by (Hyndman and Athanasopoulos 2021)… “ACF plot is also useful for identifying non-stationary time series. For a stationary time series, the ACF will drop to zero relatively quickly, while the ACF of non-stationary data decreases slowly. Also, for non-stationary data, the value of r 1 is often large and positive… PACF partial autocorrelations. These measure the relationship between yt and yt−k after removing the effects of lags 1,2,3,…,k−1.”
# New time-series for Bar, Mad, Mal, Cor and, Cad
Total_ts %>%
filter_index("2020-03-15" ~ "2020-12-31") %>%
#filter_index("2020-07-01" ~ "2020-12-31") %>%
filter(sub_region_2 == "Barcelona")-> Bar_N_cases #%>%
Total_ts %>%
filter_index("2020-03-15" ~ "2020-12-31") %>%
#filter_index("2020-07-01" ~ "2020-12-31") %>%
filter(sub_region_2 == "Madrid")-> Mad_N_cases #%>%
Total_ts %>%
filter_index("2020-03-15" ~ "2020-12-31") %>%
#filter_index("2020-07-01" ~ "2020-12-31") %>%
filter(sub_region_2 == "Málaga")-> Mal_N_cases #%>%
Total_ts %>%
filter_index("2020-03-15" ~ "2020-12-31") %>%
#filter_index("2020-07-01" ~ "2020-12-31") %>%
filter(sub_region_2 == "Cádiz")-> Cad_N_cases #%>%
Total_ts %>%
filter_index("2020-03-15" ~ "2020-12-31") %>%
#filter_index("2020-07-01" ~ "2020-12-31") %>%
filter(sub_region_2 == "Sevilla")-> Sev_N_cases #%>%
# ACF
Bar_N_cases %>%
ACF(num_casos.x, lag_max = 30) %>%
autoplot() +
labs(title=" Barcelona - ACF Nº Cases")
Mad_N_cases %>%
ACF(num_casos.x, lag_max = 30) %>%
autoplot() +
labs(title=" Madrid - ACF Nº Cases")
Mal_N_cases %>%
ACF(num_casos.x, lag_max = 30) %>%
autoplot() +
labs(title=" Málaga - ACF Nº Cases")
Cad_N_cases %>%
ACF(num_casos.x, lag_max = 30) %>%
autoplot() +
labs(title=" Cádiz - ACF Nº Cases")
Sev_N_cases %>%
ACF(num_casos.x, lag_max = 30) %>%
autoplot() +
labs(title=" Sevilla - ACF Nº Cases")
# PACF
Bar_N_cases %>%
PACF(num_casos.x, lag_max = 30) %>%
autoplot() +
labs(title="Barcelona - PACF Nº Cases")
Mad_N_cases %>%
PACF(num_casos.x, lag_max = 30) %>%
autoplot() +
labs(title=" Madrid - PACF Nº Cases")
Mal_N_cases %>%
PACF(num_casos.x, lag_max = 30) %>%
autoplot() +
labs(title=" Málaga - PACF Nº Cases")
Cad_N_cases %>%
PACF(num_casos.x, lag_max = 30) %>%
autoplot() +
labs(title=" Cádiz - PACF Nº Cases")
Sev_N_cases %>%
PACF(num_casos.x, lag_max = 30) %>%
autoplot() +
labs(title=" Sevilla - PACF Nº Cases")
#Sev_N_cases %>% gg_tsdisplay((((num_casos.x))),
# plot_type='partial', lag=36)
#Sev_N_cases %>% gg_tsdisplay(((log(num_casos.x))),
# plot_type='partial', lag=36)
#Sev_N_cases %>% gg_tsdisplay((difference(log(num_casos.x),lag=7)),
# plot_type='partial', lag=36)
#Sev_N_cases %>% gg_tsdisplay(difference(difference(log(num_casos.x),lag=7)),
# plot_type='partial', lag=36)
#ft <- Sev_N_cases %>%
# model(
# ar_mn = ARIMA(log(num_casos.x) ~ pdq(2,1,1) + PDQ(1,1,1)),
# ar_at = ARIMA(num_casos.x, stepwise = FALSE, approx = FALSE)
# )
#ft %>% select(ar_mn) %>% gg_tsresiduals(lag=36)
#ft %>% select(ar_at) %>% gg_tsresiduals(lag=36)
#augment(ft) %>% features(.innov, ljung_box, lag=36)
Box-Cox transformation (lambda is obtained using “guerrero” feature -fpp3-) and double difference is plotted (lag=7). Note: The fable package will automatically back-transform the forecasts whenever a transformation has been used in the model definition (Hyndman and Athanasopoulos 2021).
# Difference
#Bar_N_cases %>%
# gg_tsdisplay(difference(num_casos.x),
# plot_type='partial', lag_max = 30)
#Bar_N_cases %>%
# gg_tsdisplay(difference(log(num_casos.x)),
# plot_type='partial', lag_max = 30)
# Lambda values
lambda_bar <- Bar_N_cases %>%
features(num_casos.x, features = guerrero) %>%
pull(lambda_guerrero)
lambda_mad <- Mad_N_cases %>%
features(num_casos.x, features = guerrero) %>%
pull(lambda_guerrero)
lambda_mal <- Mal_N_cases %>%
features(num_casos.x, features = guerrero) %>%
pull(lambda_guerrero)
lambda_cad <- Cad_N_cases %>%
features(num_casos.x, features = guerrero) %>%
pull(lambda_guerrero)
lambda_sev <- Sev_N_cases %>%
features(num_casos.x, features = guerrero) %>%
pull(lambda_guerrero)
# Variance stb checks (variance + difference)
Bar_N_cases %>%
gg_tsdisplay(((box_cox(num_casos.x,lambda_bar))),
plot_type='partial', lag=30)+
labs(title="Barcelona - Variance stb")
Bar_N_cases %>%
gg_tsdisplay((difference(box_cox(num_casos.x,lambda_bar))),
plot_type='partial', lag=30)+
labs(title="Barcelona - Difference once")
Bar_N_cases %>%
gg_tsdisplay(difference(difference(box_cox(num_casos.x,lambda_bar))),
plot_type='partial', lag=30)+
labs(title="Barcelona - Difference double")
###########################
Mad_N_cases %>%
gg_tsdisplay(((box_cox(num_casos.x,lambda_mad))),
plot_type='partial', lag=30)+
labs(title="Madrid - Variance stb")
Mad_N_cases %>%
gg_tsdisplay((difference(box_cox(num_casos.x,lambda_mad))),
plot_type='partial', lag=30)+
labs(title="Madrid - Difference once")
Mad_N_cases %>%
gg_tsdisplay(difference(difference(box_cox(num_casos.x,lambda_mad))),
plot_type='partial', lag=30)+
labs(title="Madrid - Difference double")
###########################
Mal_N_cases %>%
gg_tsdisplay(((box_cox(num_casos.x,lambda_mal))),
plot_type='partial', lag=30)+
labs(title="Málaga - Variance stb")
Mal_N_cases %>%
gg_tsdisplay((difference(box_cox(num_casos.x,lambda_mal))),
plot_type='partial', lag=30)+
labs(title="Málaga - Difference once")
Mal_N_cases %>%
gg_tsdisplay(difference(difference(box_cox(num_casos.x,lambda_mal))),
plot_type='partial', lag=30)+
labs(title="Málaga - Difference double")
###########################
Cad_N_cases %>%
gg_tsdisplay(((box_cox(num_casos.x,lambda_cad))),
plot_type='partial',lag=30)+
labs(title="Cádiz - Variance stb")
Cad_N_cases %>%
gg_tsdisplay((difference(box_cox(num_casos.x,lambda_cad))),
plot_type='partial',lag=30)+
labs(title="Cádiz - Difference once")
Cad_N_cases %>%
gg_tsdisplay(difference(difference(box_cox(num_casos.x,lambda_cad))),
plot_type='partial',lag=30)+
labs(title="Cádiz - Difference double")
###########################
Sev_N_cases %>%
gg_tsdisplay(((box_cox(num_casos.x,lambda_sev))),
plot_type='partial', lag=30)+
labs(title="Sevilla - Variance stb")
Sev_N_cases %>%
gg_tsdisplay((difference(box_cox(num_casos.x,lambda_sev))),
plot_type='partial', lag=30)+
labs(title="Sevilla - Difference once")
Sev_N_cases %>%
gg_tsdisplay(difference(difference(box_cox(num_casos.x,lambda_sev))),
plot_type='partial', lag=30)+
labs(title="Sevilla - Difference double")
As stated by (Hyndman and Athanasopoulos 2021)… “The ARIMA() function uses unitroot_nsdiffs() to determine D (the number of seasonal differences to use), and unitroot_ndiffs() to determine d (the number of ordinary differences to use), when these are not specified.”
# Train and test ts
# Train
Bar_N_cases_tr <- Bar_N_cases %>%
#filter_index("2020-07-01" ~ "2020-12-9")
filter_index("2020-03-15" ~ "2020-12-9")
# Test
Bar_N_cases_tt <- Bar_N_cases %>%
filter_index("2020-12-10" ~ "2020-12-31")
# Model with train
fit_model <- Bar_N_cases_tr %>%
model(
SNaive = SNAIVE(num_casos.x),
arima_mn1 = ARIMA(box_cox(num_casos.x,lambda_bar) ~ pdq(2,1,2) + PDQ(1,1,1)),
arima_mn2 = ARIMA(box_cox(num_casos.x,lambda_bar) ~ pdq(1,1,2) + PDQ(0,1,2)),
arima_mn3 = ARIMA(box_cox(num_casos.x,lambda_bar) ~ pdq(1,1,2) + PDQ(1,1,2)),
arima_at1 = ARIMA(box_cox(num_casos.x,lambda_bar)),
arima_at2 = ARIMA(box_cox(num_casos.x,lambda_bar), stepwise = FALSE, approx = FALSE))
#Barcelona arima_at1 <ARIMA(0,1,1)(0,1,1)[7]>
#Barcelona arima_at2 <ARIMA(1,1,2)(0,1,1)[7]>
# Show and report model
fit_model
## # A mable: 1 x 7
## # Key: sub_region_2 [1]
## sub_region_2 SNaive arima_mn1 arima_mn2
## <chr> <model> <model> <model>
## 1 Barcelona <SNAIVE> <ARIMA(2,1,2)(1,1,1)[7]> <ARIMA(1,1,2)(0,1,2)[7]>
## # ... with 3 more variables: arima_mn3 <model>, arima_at1 <model>,
## # arima_at2 <model>
report(fit_model)
## # A tibble: 6 x 9
## sub_region_2 .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 Barcelona SNaive 148096. NA NA NA NA <NULL> <NULL>
## 2 Barcelona arima_mn1 0.171 -140. 294. 295. 319. <cpl [9~ <cpl [9]>
## 3 Barcelona arima_mn2 0.169 -139. 290. 290. 311. <cpl [1~ <cpl [16~
## 4 Barcelona arima_mn3 0.169 -139. 291. 292. 316. <cpl [8~ <cpl [16~
## 5 Barcelona arima_at1 0.170 -141. 288. 288. 299. <cpl [0~ <cpl [8]>
## 6 Barcelona arima_at2 0.168 -139. 288. 288. 305. <cpl [1~ <cpl [9]>
# Good model >> Less Sigma / AICc
fit_model %>% pivot_longer(!sub_region_2,
names_to = "Model name",
values_to = "Orders")
## # A mable: 6 x 3
## # Key: sub_region_2, Model name [6]
## sub_region_2 `Model name` Orders
## <chr> <chr> <model>
## 1 Barcelona SNaive <SNAIVE>
## 2 Barcelona arima_mn1 <ARIMA(2,1,2)(1,1,1)[7]>
## 3 Barcelona arima_mn2 <ARIMA(1,1,2)(0,1,2)[7]>
## 4 Barcelona arima_mn3 <ARIMA(1,1,2)(1,1,2)[7]>
## 5 Barcelona arima_at1 <ARIMA(0,1,1)(0,1,1)[7]>
## 6 Barcelona arima_at2 <ARIMA(1,1,2)(0,1,1)[7]>
glance(fit_model) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 6 x 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at2 0.168 -139. 288. 288. 305.
## 2 arima_at1 0.170 -141. 288. 288. 299.
## 3 arima_mn2 0.169 -139. 290. 290. 311.
## 4 arima_mn3 0.169 -139. 291. 292. 316.
## 5 arima_mn1 0.171 -140. 294. 295. 319.
## 6 SNaive 148096. NA NA NA NA
# We use a Ljung-Box test >> large p-value, confirmins residuals are similar to white noise.
augment(fit_model) %>%
features(.innov, ljung_box, lag=7)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Barcelona arima_at1 6.18 0.519
## 2 Barcelona arima_at2 4.08 0.770
## 3 Barcelona arima_mn1 4.21 0.755
## 4 Barcelona arima_mn2 4.06 0.773
## 5 Barcelona arima_mn3 4.06 0.772
## 6 Barcelona SNaive 827. 0
augment(fit_model) %>%
features(.innov, ljung_box, lag=14)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Barcelona arima_at1 18.2 0.198
## 2 Barcelona arima_at2 12.4 0.575
## 3 Barcelona arima_mn1 14.5 0.416
## 4 Barcelona arima_mn2 12.3 0.579
## 5 Barcelona arima_mn3 12.0 0.609
## 6 Barcelona SNaive 1009. 0
augment(fit_model) %>%
features(.innov, ljung_box, lag=21)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Barcelona arima_at1 39.1 0.00954
## 2 Barcelona arima_at2 32.0 0.0585
## 3 Barcelona arima_mn1 35.2 0.0271
## 4 Barcelona arima_mn2 31.9 0.0593
## 5 Barcelona arima_mn3 31.5 0.0654
## 6 Barcelona SNaive 1039. 0
fit_model %>% select(SNaive) %>% gg_tsresiduals()
fit_model %>% select(arima_mn1) %>% gg_tsresiduals()
fit_model %>% select(arima_mn2) %>% gg_tsresiduals()
fit_model %>% select(arima_mn3) %>% gg_tsresiduals()
fit_model %>% select(arima_at1) %>% gg_tsresiduals()
fit_model %>% select(arima_at2) %>% gg_tsresiduals()
# Significant spikes (at lag 6, 15, etc.) out of 30 is still consistent with white noise.
# To be sure, use a Ljung-Box test, which has a large p-value, confirming that
# the residuals are similar to white noise.
# Note that the alternative models also pass this test.
# Forecast
fc_h7<-fabletools::forecast(fit_model, h=7)
fc_h14<-fabletools::forecast(fit_model, h=14)
fc_h21<-fabletools::forecast(fit_model, h=21)
# Accuracy
fabletools::accuracy(fc_h7, Bar_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Barcelona Test 356. 382. 356. 30.3 30.3 NaN NaN 0.453
## 2 arima_at2 Barcelona Test 346. 368. 346. 29.6 29.6 NaN NaN 0.377
## 3 arima_mn1 Barcelona Test 357. 381. 357. 30.3 30.3 NaN NaN 0.388
## 4 arima_mn2 Barcelona Test 346. 368. 346. 29.6 29.6 NaN NaN 0.380
## 5 arima_mn3 Barcelona Test 342. 363. 342. 29.3 29.3 NaN NaN 0.389
## 6 SNaive Barcelona Test 413 467. 413 35.1 35.1 NaN NaN -0.138
fabletools::accuracy(fc_h14, Bar_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Barcelona Test 506. 547. 506. 38.1 38.1 NaN NaN 0.681
## 2 arima_at2 Barcelona Test 464. 495. 464. 35.2 35.2 NaN NaN 0.631
## 3 arima_mn1 Barcelona Test 495. 533. 495. 37.3 37.3 NaN NaN 0.655
## 4 arima_mn2 Barcelona Test 464. 495. 464. 35.2 35.2 NaN NaN 0.632
## 5 arima_mn3 Barcelona Test 460. 492. 460. 34.9 34.9 NaN NaN 0.637
## 6 SNaive Barcelona Test 554. 612. 554. 41.8 41.8 NaN NaN 0.189
fabletools::accuracy(fc_h21, Bar_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Barcelona Test 655. 756. 655. 42.5 42.5 NaN NaN 0.598
## 2 arima_at2 Barcelona Test 575. 660. 582. 37.5 38.3 NaN NaN 0.517
## 3 arima_mn1 Barcelona Test 635. 731. 635. 41.2 41.2 NaN NaN 0.579
## 4 arima_mn2 Barcelona Test 575. 660. 582. 37.5 38.3 NaN NaN 0.517
## 5 arima_mn3 Barcelona Test 570. 655. 578. 37.2 38.0 NaN NaN 0.514
## 6 SNaive Barcelona Test 700. 806. 700. 46.0 46.0 NaN NaN 0.455
# Plots
fc_h7 %>%
#filter(.model=='arima_at1'|.model=='arima_at2'|.model=='SNaive') %>%
autoplot(Bar_N_cases_tt) +
labs(title="Barcelona - forecast h7")
fc_h14 %>%
#filter(.model=='arima_at1'|.model=='arima_at2'|.model=='SNaive') %>%
autoplot(Bar_N_cases_tt) +
labs(title="Barcelona - forecast h14")
fc_h21 %>%
#filter(.model=='arima_at1'|.model=='arima_at2'|.model=='SNaive') %>%
autoplot(Bar_N_cases_tt) +
labs(title="Barcelona - forecast h21")
# Opt A
# We have added residential_percent_change variable to models
lambda_bar_b <- Bar_N_cases %>%
features(residential_percent_change_from_baseline, features = guerrero) %>%
pull(lambda_guerrero)
fit_model <- Bar_N_cases_tr %>%
model(
SNaive = SNAIVE(num_casos.x),
arima_mn1 = ARIMA(box_cox(num_casos.x,lambda_bar) ~
box_cox(residential_percent_change_from_baseline,lambda_bar_b) +
pdq(2,1,2) + PDQ(1,1,1)),
arima_mn2 = ARIMA(box_cox(num_casos.x,lambda_bar) ~
box_cox(residential_percent_change_from_baseline,lambda_bar_b) +
pdq(1,1,2) + PDQ(0,1,2)),
arima_mn3 = ARIMA(box_cox(num_casos.x,lambda_bar) ~
box_cox(residential_percent_change_from_baseline,lambda_bar_b) +
pdq(1,1,2) + PDQ(1,1,2)),
arima_at1 = ARIMA(box_cox(num_casos.x,lambda_bar) ~
box_cox(residential_percent_change_from_baseline,lambda_bar_b)),
arima_at2 = ARIMA(box_cox(num_casos.x,lambda_bar) ~
box_cox(residential_percent_change_from_baseline,lambda_bar_b),
stepwise = FALSE, approx = FALSE))
# Barcelona arima_at1 <LM w/ ARIMA(0,1,2)(0,1,1)[7] errors>
# Barcelona arima_at2 <LM w/ ARIMA(1,1,0)(0,1,1)[7] errors>
# Show and report model
fit_model
## # A mable: 1 x 7
## # Key: sub_region_2 [1]
## sub_region_2 SNaive arima_mn1
## <chr> <model> <model>
## 1 Barcelona <SNAIVE> <LM w/ ARIMA(2,1,2)(1,1,1)[7] errors>
## # ... with 4 more variables: arima_mn2 <model>, arima_mn3 <model>,
## # arima_at1 <model>, arima_at2 <model>
report(fit_model)
## # A tibble: 6 x 9
## sub_region_2 .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 Barcelona SNaive 148096. NA NA NA NA <NULL> <NULL>
## 2 Barcelona arima_mn1 0.135 -107. 231. 232. 260. <cpl [9~ <cpl [9]>
## 3 Barcelona arima_mn2 0.135 -108. 230. 230. 255. <cpl [1~ <cpl [16~
## 4 Barcelona arima_mn3 0.135 -108. 231. 232. 260. <cpl [8~ <cpl [16~
## 5 Barcelona arima_at1 0.134 -109. 227. 227. 245. <cpl [0~ <cpl [9]>
## 6 Barcelona arima_at2 0.134 -109. 226. 226. 240. <cpl [1~ <cpl [7]>
# Good model >> Less Sigma / AICc
glance(fit_model) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 6 x 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at2 0.134 -109. 226. 226. 240.
## 2 arima_at1 0.134 -109. 227. 227. 245.
## 3 arima_mn2 0.135 -108. 230. 230. 255.
## 4 arima_mn1 0.135 -107. 231. 232. 260.
## 5 arima_mn3 0.135 -108. 231. 232. 260.
## 6 SNaive 148096. NA NA NA NA
# We use a Ljung-Box test >> large p-value, confirmins residuals are similar to white noise.
augment(fit_model) %>%
features(.innov, ljung_box, lag=7)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Barcelona arima_at1 8.64 0.280
## 2 Barcelona arima_at2 9.18 0.240
## 3 Barcelona arima_mn1 7.61 0.368
## 4 Barcelona arima_mn2 7.57 0.372
## 5 Barcelona arima_mn3 7.73 0.357
## 6 Barcelona SNaive 827. 0
augment(fit_model) %>%
features(.innov, ljung_box, lag=14)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Barcelona arima_at1 18.5 0.185
## 2 Barcelona arima_at2 19.6 0.142
## 3 Barcelona arima_mn1 17.9 0.211
## 4 Barcelona arima_mn2 17.3 0.241
## 5 Barcelona arima_mn3 17.9 0.209
## 6 Barcelona SNaive 1009. 0
augment(fit_model) %>%
features(.innov, ljung_box, lag=21)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Barcelona arima_at1 43.8 0.00245
## 2 Barcelona arima_at2 45.7 0.00140
## 3 Barcelona arima_mn1 38.9 0.0101
## 4 Barcelona arima_mn2 39.9 0.00762
## 5 Barcelona arima_mn3 39.2 0.00929
## 6 Barcelona SNaive 1039. 0
fit_model %>% select(SNaive) %>% gg_tsresiduals()
fit_model %>% select(arima_mn1) %>% gg_tsresiduals()
fit_model %>% select(arima_mn2) %>% gg_tsresiduals()
fit_model %>% select(arima_mn3) %>% gg_tsresiduals()
fit_model %>% select(arima_at1) %>% gg_tsresiduals()
fit_model %>% select(arima_at2) %>% gg_tsresiduals()
# Significant spikes out of 30 is still consistent with white noise.
# To be sure, use a Ljung-Box test, which has a large p-value, confirming that the
# residuals are similar to white noise.
# Note that the alternative models also pass this test.
# New data (dynamic regression)
# Here it is needed generate future values for the exogenous variables
# Foir simplicity we select a rand number included into the 2nd and 3rd quantile for
# the variable
# h7
Bar_N_cases_fr7 <- new_data(Bar_N_cases_tr, 7) %>%
mutate(residential_percent_change_from_baseline =
runif(7,quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.75)))
# h14
Bar_N_cases_fr14 <- new_data(Bar_N_cases_tr, 14) %>%
mutate(residential_percent_change_from_baseline =
runif(14,quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.75)))
# h21
Bar_N_cases_fr21 <- new_data(Bar_N_cases_tr, 21) %>%
mutate(residential_percent_change_from_baseline =
runif(21,quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.75)))
# Forecast
fc_fh7<-fabletools::forecast(fit_model, new_data = Bar_N_cases_fr7)
fc_fh14<-fabletools::forecast(fit_model, new_data = Bar_N_cases_fr14)
fc_fh21<-fabletools::forecast(fit_model, new_data = Bar_N_cases_fr21)
# Accuracy
fabletools::accuracy(fc_fh7, Bar_N_cases)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Barcelona Test 299. 330. 299. 25.0 25.0 1.33 0.857 0.437
## 2 arima_at2 Barcelona Test 312. 343. 312. 26.1 26.1 1.38 0.892 0.456
## 3 arima_mn1 Barcelona Test 309. 344. 309. 25.7 25.7 1.37 0.895 0.479
## 4 arima_mn2 Barcelona Test 304. 338. 304. 25.2 25.2 1.35 0.879 0.483
## 5 arima_mn3 Barcelona Test 307. 341. 307. 25.5 25.5 1.36 0.888 0.476
## 6 SNaive Barcelona Test 413 467. 413 35.1 35.1 1.83 1.21 -0.138
fabletools::accuracy(fc_fh14, Bar_N_cases)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Barcelona Test 411. 455. 411. 30.9 30.9 1.83 1.18 0.742
## 2 arima_at2 Barcelona Test 433. 479. 433. 32.5 32.5 1.92 1.25 0.748
## 3 arima_mn1 Barcelona Test 432. 479. 432. 32.3 32.3 1.92 1.25 0.741
## 4 arima_mn2 Barcelona Test 424. 471. 424. 31.8 31.8 1.88 1.23 0.743
## 5 arima_mn3 Barcelona Test 429. 476. 429. 32.1 32.1 1.90 1.24 0.742
## 6 SNaive Barcelona Test 554. 612. 554. 41.8 41.8 2.46 1.59 0.189
fabletools::accuracy(fc_fh21, Bar_N_cases)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Barcelona Test 520. 620. 543. 33.3 35.7 2.41 1.61 0.499
## 2 arima_at2 Barcelona Test 554. 656. 571. 35.6 37.4 2.54 1.71 0.529
## 3 arima_mn1 Barcelona Test 545. 646. 565. 34.9 37.0 2.51 1.68 0.507
## 4 arima_mn2 Barcelona Test 535. 636. 557. 34.3 36.5 2.47 1.65 0.505
## 5 arima_mn3 Barcelona Test 542. 644. 563. 34.7 36.9 2.50 1.67 0.506
## 6 SNaive Barcelona Test 700. 806. 700. 46.0 46.0 3.11 2.10 0.455
# Plots
fc_fh7 %>%
#filter(.model=='arima_at1'|.model=='arima_at2'|.model=='SNaive') %>%
autoplot(Bar_N_cases_tt) +
labs(title="Barcelona - forecast h7")
fc_fh14 %>%
#filter(.model=='arima_at1'|.model=='arima_at2'|.model=='SNaive') %>%
autoplot(Bar_N_cases_tt) +
labs(title="Barcelona - forecast h14")
fc_fh21 %>%
#filter(.model=='arima_at1'|.model=='arima_at2'|.model=='SNaive') %>%
autoplot(Bar_N_cases_tt) +
labs(title="Barcelona - forecast h21")
# Model train
# Opt B
# We have added all mobility variables to models
lambda_bar_b <- Bar_N_cases %>%
features(retail_and_recreation_percent_change_from_baseline, features = guerrero) %>%
pull(lambda_guerrero)
lambda_bar_c <- Bar_N_cases %>%
features(grocery_and_pharmacy_percent_change_from_baseline, features = guerrero) %>%
pull(lambda_guerrero)
lambda_bar_d <- Bar_N_cases %>%
features(parks_percent_change_from_baseline, features = guerrero) %>%
pull(lambda_guerrero)
lambda_bar_e <- Bar_N_cases %>%
features(transit_stations_percent_change_from_baseline, features = guerrero) %>%
pull(lambda_guerrero)
lambda_bar_f <- Bar_N_cases %>%
features(workplaces_percent_change_from_baseline, features = guerrero) %>%
pull(lambda_guerrero)
lambda_bar_g <- Bar_N_cases %>%
features(residential_percent_change_from_baseline, features = guerrero) %>%
pull(lambda_guerrero)
lambda_bar_h <- Bar_N_cases %>%
features(Total, features = guerrero) %>%
pull(lambda_guerrero)
fit_model <- Bar_N_cases_tr %>%
model(
SNaive = SNAIVE(num_casos.x),
arima_mn1 = ARIMA(box_cox(num_casos.x,lambda_bar) ~
box_cox(retail_and_recreation_percent_change_from_baseline,lambda_bar_b) +
box_cox(grocery_and_pharmacy_percent_change_from_baseline,lambda_bar_c) +
box_cox(parks_percent_change_from_baseline,lambda_bar_d) +
box_cox(transit_stations_percent_change_from_baseline,lambda_bar_e) +
box_cox(workplaces_percent_change_from_baseline,lambda_bar_f) +
box_cox(residential_percent_change_from_baseline,lambda_bar_g) +
box_cox(Total,lambda_bar_h) +
pdq(2,1,2) + PDQ(1,1,1)),
arima_mn2 = ARIMA(box_cox(num_casos.x,lambda_bar) ~
box_cox(retail_and_recreation_percent_change_from_baseline,lambda_bar_b) +
box_cox(grocery_and_pharmacy_percent_change_from_baseline,lambda_bar_c) +
box_cox(parks_percent_change_from_baseline,lambda_bar_d) +
box_cox(transit_stations_percent_change_from_baseline,lambda_bar_e) +
box_cox(workplaces_percent_change_from_baseline,lambda_bar_f) +
box_cox(residential_percent_change_from_baseline,lambda_bar_g) +
box_cox(Total,lambda_bar_h) +
pdq(1,1,2) + PDQ(0,1,2)),
arima_mn3 = ARIMA(box_cox(num_casos.x,lambda_bar) ~
box_cox(retail_and_recreation_percent_change_from_baseline,lambda_bar_b) +
box_cox(grocery_and_pharmacy_percent_change_from_baseline,lambda_bar_c) +
box_cox(parks_percent_change_from_baseline,lambda_bar_d) +
box_cox(transit_stations_percent_change_from_baseline,lambda_bar_e) +
box_cox(workplaces_percent_change_from_baseline,lambda_bar_f) +
box_cox(residential_percent_change_from_baseline,lambda_bar_g) +
box_cox(Total,lambda_bar_h) +
pdq(1,1,2) + PDQ(1,1,2)),
arima_at1 = ARIMA(box_cox(num_casos.x,lambda_bar) ~
box_cox(retail_and_recreation_percent_change_from_baseline,lambda_bar_b) +
box_cox(grocery_and_pharmacy_percent_change_from_baseline,lambda_bar_c) +
box_cox(parks_percent_change_from_baseline,lambda_bar_d) +
box_cox(transit_stations_percent_change_from_baseline,lambda_bar_e) +
box_cox(workplaces_percent_change_from_baseline,lambda_bar_f) +
box_cox(residential_percent_change_from_baseline,lambda_bar_g) +
box_cox(Total,lambda_bar_h)),
arima_at2 = ARIMA(box_cox(num_casos.x,lambda_bar) ~
box_cox(retail_and_recreation_percent_change_from_baseline,lambda_bar_b) +
box_cox(grocery_and_pharmacy_percent_change_from_baseline,lambda_bar_c) +
box_cox(parks_percent_change_from_baseline,lambda_bar_d) +
box_cox(transit_stations_percent_change_from_baseline,lambda_bar_e) +
box_cox(workplaces_percent_change_from_baseline,lambda_bar_f) +
box_cox(residential_percent_change_from_baseline,lambda_bar_g) +
box_cox(Total,lambda_bar_h),
stepwise = FALSE,
approx = FALSE))
# Barcelona arima_at1 <LM w/ ARIMA(0,1,1)(1,0,0)[7] errors>
# Barcelona arima_at2 <LM w/ ARIMA(1,1,0)(2,0,0)[7] errors>
# Show and report model
fit_model
## # A mable: 1 x 7
## # Key: sub_region_2 [1]
## sub_region_2 SNaive arima_mn1
## <chr> <model> <model>
## 1 Barcelona <SNAIVE> <LM w/ ARIMA(2,1,2)(1,1,1)[7] errors>
## # ... with 4 more variables: arima_mn2 <model>, arima_mn3 <model>,
## # arima_at1 <model>, arima_at2 <model>
report(fit_model)
## # A tibble: 6 x 9
## sub_region_2 .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 Barcelona SNaive 1.48e+5 NA NA NA NA <NULL> <NULL>
## 2 Barcelona arima_mn1 1.24e-1 -93.7 215. 217. 265. <cpl [9]> <cpl [9]>
## 3 Barcelona arima_mn2 1.27e-1 -97.7 221. 223. 268. <cpl [1]> <cpl [16~
## 4 Barcelona arima_mn3 1.28e-1 -97.8 224. 225. 273. <cpl [8]> <cpl [16~
## 5 Barcelona arima_at1 1.65e-1 -135. 291. 292. 327. <cpl [7]> <cpl [1]>
## 6 Barcelona arima_at2 1.39e-1 -116. 254. 255. 293. <cpl [15~ <cpl [0]>
# Good model >> Less Sigma / AICc
fit_model %>% pivot_longer(!sub_region_2,
names_to = "Model name",
values_to = "Orders")
## # A mable: 6 x 3
## # Key: sub_region_2, Model name [6]
## sub_region_2 `Model name` Orders
## <chr> <chr> <model>
## 1 Barcelona SNaive <SNAIVE>
## 2 Barcelona arima_mn1 <LM w/ ARIMA(2,1,2)(1,1,1)[7] errors>
## 3 Barcelona arima_mn2 <LM w/ ARIMA(1,1,2)(0,1,2)[7] errors>
## 4 Barcelona arima_mn3 <LM w/ ARIMA(1,1,2)(1,1,2)[7] errors>
## 5 Barcelona arima_at1 <LM w/ ARIMA(0,1,1)(1,0,0)[7] errors>
## 6 Barcelona arima_at2 <LM w/ ARIMA(1,1,0)(2,0,0)[7] errors>
glance(fit_model) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 6 x 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_mn1 0.124 -93.7 215. 217. 265.
## 2 arima_mn2 0.127 -97.7 221. 223. 268.
## 3 arima_mn3 0.128 -97.8 224. 225. 273.
## 4 arima_at2 0.139 -116. 254. 255. 293.
## 5 arima_at1 0.165 -135. 291. 292. 327.
## 6 SNaive 148096. NA NA NA NA
# We use a Ljung-Box test >> large p-value, confirmins residuals are similar to white noise.
augment(fit_model) %>%
features(.innov, ljung_box, lag=7)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Barcelona arima_at1 11.7 0.112
## 2 Barcelona arima_at2 11.2 0.129
## 3 Barcelona arima_mn1 6.81 0.449
## 4 Barcelona arima_mn2 7.41 0.388
## 5 Barcelona arima_mn3 7.52 0.377
## 6 Barcelona SNaive 827. 0
augment(fit_model) %>%
features(.innov, ljung_box, lag=14)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Barcelona arima_at1 23.7 0.0496
## 2 Barcelona arima_at2 27.2 0.0179
## 3 Barcelona arima_mn1 14.0 0.450
## 4 Barcelona arima_mn2 17.8 0.218
## 5 Barcelona arima_mn3 17.9 0.209
## 6 Barcelona SNaive 1009. 0
augment(fit_model) %>%
features(.innov, ljung_box, lag=21)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Barcelona arima_at1 54.2 0.0000921
## 2 Barcelona arima_at2 60.2 0.0000119
## 3 Barcelona arima_mn1 48.5 0.000589
## 4 Barcelona arima_mn2 45.8 0.00135
## 5 Barcelona arima_mn3 46.2 0.00119
## 6 Barcelona SNaive 1039. 0
fit_model %>% select(SNaive) %>% gg_tsresiduals()
fit_model %>% select(arima_mn1) %>% gg_tsresiduals()
fit_model %>% select(arima_mn2) %>% gg_tsresiduals()
fit_model %>% select(arima_mn3) %>% gg_tsresiduals()
fit_model %>% select(arima_at1) %>% gg_tsresiduals()
fit_model %>% select(arima_at2) %>% gg_tsresiduals()
# Significant spikes out of 30 is still consistent with white noise.
# To be sure, use a Ljung-Box test, which has a large p-value, confirming that the
# residuals are similar to white noise.
# Note that the alternative models also pass this test.
# New data (dynamic regression)
# Here it is needed generate future values for the exogenous variables
# Foir simplicity we select a rand number included into the 2nd and 3rd quantile for
# the variable
# h7
Bar_N_cases_fr7 <- new_data(Bar_N_cases_tr, 7) %>%
mutate(retail_and_recreation_percent_change_from_baseline =
runif(7,quantile(Bar_N_cases_tt$retail_and_recreation_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$retail_and_recreation_percent_change_from_baseline,
0.75)),
grocery_and_pharmacy_percent_change_from_baseline =
runif(7,quantile(Bar_N_cases_tt$grocery_and_pharmacy_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$grocery_and_pharmacy_percent_change_from_baseline,
0.75)),
parks_percent_change_from_baseline =
runif(7,quantile(Bar_N_cases_tr$parks_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$parks_percent_change_from_baseline,
0.75)),
transit_stations_percent_change_from_baseline =
runif(7,quantile(Bar_N_cases_tt$transit_stations_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$transit_stations_percent_change_from_baseline,
0.75)),
workplaces_percent_change_from_baseline =
runif(7,quantile(Bar_N_cases_tt$workplaces_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$workplaces_percent_change_from_baseline,
0.75)),
residential_percent_change_from_baseline =
runif(7,quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.75)),
Total = runif(7,quantile(Bar_N_cases_tt$Total,0.25),
quantile(Bar_N_cases_tt$Total,0.75)))
# h14
Bar_N_cases_fr14 <- new_data(Bar_N_cases_tr, 14) %>%
mutate(retail_and_recreation_percent_change_from_baseline =
runif(14,quantile(Bar_N_cases_tt$retail_and_recreation_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$retail_and_recreation_percent_change_from_baseline,
0.75)),
grocery_and_pharmacy_percent_change_from_baseline =
runif(14,quantile(Bar_N_cases_tt$grocery_and_pharmacy_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$grocery_and_pharmacy_percent_change_from_baseline,
0.75)),
parks_percent_change_from_baseline =
runif(14,quantile(Bar_N_cases_tt$parks_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$parks_percent_change_from_baseline,
0.75)),
transit_stations_percent_change_from_baseline =
runif(14,quantile(Bar_N_cases_tt$transit_stations_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$transit_stations_percent_change_from_baseline,
0.75)),
workplaces_percent_change_from_baseline =
runif(14,quantile(Bar_N_cases_tt$workplaces_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$workplaces_percent_change_from_baseline,
0.75)),
residential_percent_change_from_baseline =
runif(14,quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.75)),
Total = runif(14,quantile(Bar_N_cases_tt$Total,0.25),
quantile(Bar_N_cases_tt$Total,0.75)))
# h21
Bar_N_cases_fr21 <- new_data(Bar_N_cases_tr, 21) %>%
mutate(retail_and_recreation_percent_change_from_baseline =
runif(21,quantile(Bar_N_cases_tt$retail_and_recreation_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$retail_and_recreation_percent_change_from_baseline,
0.75)),
grocery_and_pharmacy_percent_change_from_baseline =
runif(21,quantile(Bar_N_cases_tt$grocery_and_pharmacy_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$grocery_and_pharmacy_percent_change_from_baseline,
0.75)),
parks_percent_change_from_baseline =
runif(21,quantile(Bar_N_cases_tt$parks_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$parks_percent_change_from_baseline,
0.75)),
transit_stations_percent_change_from_baseline =
runif(21,quantile(Bar_N_cases_tt$transit_stations_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$transit_stations_percent_change_from_baseline,
0.75)),
workplaces_percent_change_from_baseline =
runif(21,quantile(Bar_N_cases_tt$workplaces_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$workplaces_percent_change_from_baseline,
0.75)),
residential_percent_change_from_baseline =
runif(21,quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.25),
quantile(Bar_N_cases_tt$residential_percent_change_from_baseline,
0.75)),
Total = runif(21,quantile(Bar_N_cases_tt$Total,0.25),
quantile(Bar_N_cases_tt$Total,0.75)))
# Forecast
fc_fh7<-fabletools::forecast(fit_model, new_data = Bar_N_cases_fr7)
fc_fh14<-fabletools::forecast(fit_model, new_data = Bar_N_cases_fr14)
fc_fh21<-fabletools::forecast(fit_model, new_data = Bar_N_cases_fr21)
# Accuracy
fabletools::accuracy(fc_fh7, Bar_N_cases)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Barcelona Test 180. 315. 264. 10.3 22.2 1.17 0.820 0.175
## 2 arima_at2 Barcelona Test 149. 178. 149. 12.0 12.0 0.661 0.463 0.107
## 3 arima_mn1 Barcelona Test 223. 246. 223. 19.2 19.2 0.992 0.641 0.335
## 4 arima_mn2 Barcelona Test 236. 256. 236. 20.3 20.3 1.05 0.666 0.398
## 5 arima_mn3 Barcelona Test 235. 255. 235. 20.3 20.3 1.04 0.663 0.384
## 6 SNaive Barcelona Test 413 467. 413 35.1 35.1 1.83 1.21 -0.138
fabletools::accuracy(fc_fh14, Bar_N_cases)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Barcelona Test 230. 369. 312. 12.7 23.1 1.39 0.959 0.0956
## 2 arima_at2 Barcelona Test 216. 253. 216. 15.9 15.9 0.960 0.659 0.223
## 3 arima_mn1 Barcelona Test 373. 420. 373. 28.0 28.0 1.66 1.09 0.681
## 4 arima_mn2 Barcelona Test 368. 409. 368. 27.7 27.7 1.63 1.06 0.684
## 5 arima_mn3 Barcelona Test 365. 405. 365. 27.5 27.5 1.62 1.05 0.677
## 6 SNaive Barcelona Test 554. 612. 554. 41.8 41.8 2.46 1.59 0.189
fabletools::accuracy(fc_fh21, Bar_N_cases)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Barcelona Test 282. 486. 380. 13.2 24.0 1.69 1.26 0.306
## 2 arima_at2 Barcelona Test 208. 306. 261. 12.5 17.9 1.16 0.797 0.125
## 3 arima_mn1 Barcelona Test 492. 591. 504. 31.6 32.8 2.24 1.54 0.546
## 4 arima_mn2 Barcelona Test 491. 590. 504. 31.3 32.8 2.24 1.54 0.539
## 5 arima_mn3 Barcelona Test 487. 586. 501. 31.1 32.6 2.22 1.52 0.538
## 6 SNaive Barcelona Test 700. 806. 700. 46.0 46.0 3.11 2.10 0.455
# Plots
fc_fh7 %>%
#filter(.model=='arima_at1'|.model=='arima_at2'|.model=='SNaive') %>%
autoplot(Bar_N_cases_tt) +
labs(title="Barcelona - forecast h7")
fc_fh14 %>%
#filter(.model=='arima_at1'|.model=='arima_at2'|.model=='SNaive') %>%
autoplot(Bar_N_cases_tt) +
labs(title="Barcelona - forecast h14")
fc_fh21 %>%
#filter(.model=='arima_at1'|.model=='arima_at2'|.model=='SNaive') %>%
autoplot(Bar_N_cases_tt) +
labs(title="Barcelona - forecast h21")
# Train and test ts
# Train
Mad_N_cases_tr <- Mad_N_cases %>%
filter_index("2020-03-15" ~ "2020-12-9")
Mal_N_cases_tr <- Mal_N_cases %>%
filter_index("2020-03-15" ~ "2020-12-9")
Cad_N_cases_tr <- Cad_N_cases %>%
filter_index("2020-03-15" ~ "2020-12-9")
Sev_N_cases_tr <- Sev_N_cases %>%
filter_index("2020-03-15" ~ "2020-12-9")
# Test
Mad_N_cases_tt <- Mad_N_cases %>%
filter_index("2020-12-10" ~ "2020-12-31")
Mal_N_cases_tt <- Mal_N_cases %>%
filter_index("2020-12-10" ~ "2020-12-31")
Cad_N_cases_tt <- Cad_N_cases %>%
filter_index("2020-12-10" ~ "2020-12-31")
Sev_N_cases_tt <- Sev_N_cases %>%
filter_index("2020-12-10" ~ "2020-12-31")
####### Madrid #######
# Model train
Mad_fit_model <- Mad_N_cases_tr %>%
model(
SNaive = SNAIVE(num_casos.x),
arima_mn1 = ARIMA(box_cox(num_casos.x,lambda_mad) ~ pdq(1,1,2) + PDQ(0,1,1)),
arima_mn2 = ARIMA(box_cox(num_casos.x,lambda_mad) ~ pdq(2,1,2) + PDQ(0,1,1)),
arima_mn3 = ARIMA(box_cox(num_casos.x,lambda_mad) ~ pdq(2,1,2) + PDQ(0,1,0)),
arima_at1 = ARIMA(box_cox(num_casos.x,lambda_mad)),
arima_at2 = ARIMA(box_cox(num_casos.x,lambda_mad), stepwise = FALSE,approx = FALSE))
# Madrid arima_at1 <ARIMA(0,1,2)(0,1,1)[7]>
# Madrid arima_at2 <ARIMA(0,1,2)(0,1,1)[7]>
Mad_fit_model %>% pivot_longer(!sub_region_2,
names_to = "Model name",
values_to = "Orders")
## # A mable: 6 x 3
## # Key: sub_region_2, Model name [6]
## sub_region_2 `Model name` Orders
## <chr> <chr> <model>
## 1 Madrid SNaive <SNAIVE>
## 2 Madrid arima_mn1 <ARIMA(1,1,2)(0,1,1)[7]>
## 3 Madrid arima_mn2 <ARIMA(2,1,2)(0,1,1)[7]>
## 4 Madrid arima_mn3 <ARIMA(2,1,2)(0,1,0)[7]>
## 5 Madrid arima_at1 <ARIMA(0,1,2)(0,1,1)[7]>
## 6 Madrid arima_at2 <ARIMA(0,1,2)(0,1,1)[7]>
# Good model >> Less Sigma / More BIC or AIC
glance(Mad_fit_model) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 6 x 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 0.0750 -32.4 72.9 73.1 87.2
## 2 arima_at2 0.0750 -32.4 72.9 73.1 87.2
## 3 arima_mn1 0.0751 -32.2 74.4 74.6 92.2
## 4 arima_mn2 0.0748 -31.2 74.4 74.7 95.8
## 5 arima_mn3 0.100 -68.1 146. 146. 164.
## 6 SNaive 205618. NA NA NA NA
Mad_fit_model %>% select(SNaive) %>% gg_tsresiduals()
Mad_fit_model %>% select(arima_mn1) %>% gg_tsresiduals()
Mad_fit_model %>% select(arima_mn2) %>% gg_tsresiduals()
Mad_fit_model %>% select(arima_mn3) %>% gg_tsresiduals()
Mad_fit_model %>% select(arima_at1) %>% gg_tsresiduals()
Mad_fit_model %>% select(arima_at2) %>% gg_tsresiduals()
augment(Mad_fit_model) %>%
features(.innov, ljung_box, lag=7)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Madrid arima_at1 7.21 4.07e- 1
## 2 Madrid arima_at2 7.21 4.07e- 1
## 3 Madrid arima_mn1 6.43 4.90e- 1
## 4 Madrid arima_mn2 7.32 3.96e- 1
## 5 Madrid arima_mn3 56.8 6.50e-10
## 6 Madrid SNaive 626. 0.
augment(Mad_fit_model) %>%
features(.innov, ljung_box, lag=14)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Madrid arima_at1 15.0 0.377
## 2 Madrid arima_at2 15.0 0.377
## 3 Madrid arima_mn1 14.3 0.427
## 4 Madrid arima_mn2 13.8 0.467
## 5 Madrid arima_mn3 67.6 0.00000000515
## 6 Madrid SNaive 918. 0
augment(Mad_fit_model) %>%
features(.innov, ljung_box, lag=21)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Madrid arima_at1 16.7 0.731
## 2 Madrid arima_at2 16.7 0.731
## 3 Madrid arima_mn1 16.1 0.766
## 4 Madrid arima_mn2 15.2 0.812
## 5 Madrid arima_mn3 72.6 0.000000132
## 6 Madrid SNaive 1016. 0
# Forecast
Mad_fc_h7<-fabletools::forecast(Mad_fit_model, h=7)
Mad_fc_h14<-fabletools::forecast(Mad_fit_model, h=14)
Mad_fc_h21<-fabletools::forecast(Mad_fit_model, h=21)
# Accuracy
fabletools::accuracy(Mad_fc_h7, Mad_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Madrid Test 525. 545. 525. 29.3 29.3 NaN NaN -0.0803
## 2 arima_at2 Madrid Test 525. 545. 525. 29.3 29.3 NaN NaN -0.0803
## 3 arima_mn1 Madrid Test 518. 538. 518. 28.9 28.9 NaN NaN -0.0931
## 4 arima_mn2 Madrid Test 522. 542. 522. 29.2 29.2 NaN NaN -0.0772
## 5 arima_mn3 Madrid Test 532. 573. 532. 29.9 29.9 NaN NaN -0.303
## 6 SNaive Madrid Test 684. 716. 684. 38.6 38.6 NaN NaN -0.239
fabletools::accuracy(Mad_fc_h14, Mad_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Madrid Test 507. 522. 507. 28.2 28.2 NaN NaN -0.119
## 2 arima_at2 Madrid Test 507. 522. 507. 28.2 28.2 NaN NaN -0.119
## 3 arima_mn1 Madrid Test 495. 510. 495. 27.5 27.5 NaN NaN -0.114
## 4 arima_mn2 Madrid Test 506. 521. 506. 28.1 28.1 NaN NaN -0.113
## 5 arima_mn3 Madrid Test 451. 498. 451. 25.3 25.3 NaN NaN -0.0389
## 6 SNaive Madrid Test 707. 732. 707. 39.6 39.6 NaN NaN -0.215
fabletools::accuracy(Mad_fc_h21, Mad_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Madrid Test 564. 695. 601. 26.7 29.6 NaN NaN 0.522
## 2 arima_at2 Madrid Test 564. 695. 601. 26.7 29.6 NaN NaN 0.522
## 3 arima_mn1 Madrid Test 545. 677. 587. 25.7 29.0 NaN NaN 0.520
## 4 arima_mn2 Madrid Test 563. 695. 600. 26.6 29.6 NaN NaN 0.526
## 5 arima_mn3 Madrid Test 413. 557. 502. 19.5 26.4 NaN NaN 0.504
## 6 SNaive Madrid Test 825. 938. 825. 41.2 41.2 NaN NaN 0.553
# Plots
Mad_fc_h7 %>%
autoplot(Mad_N_cases_tt) +
labs(title="Madrid - forecast h7")
Mad_fc_h14 %>%
autoplot(Mad_N_cases_tt) +
labs(title="Madrid - forecast h14")
Mad_fc_h21 %>%
autoplot(Mad_N_cases_tt) +
labs(title="Madrid - forecast h21")
####### Málaga #######
# Model train
Mal_fit_model <- Mal_N_cases_tr %>%
model(
SNaive = SNAIVE(num_casos.x),
arima_mn1 = ARIMA(box_cox(num_casos.x,lambda_mal) ~ pdq(1,1,1) + PDQ(1,0,1)),
arima_mn2 = ARIMA(box_cox(num_casos.x,lambda_mal) ~ pdq(1,1,1) + PDQ(1,1,1)),
arima_mn3 = ARIMA(box_cox(num_casos.x,lambda_mal) ~ pdq(0,1,1) + PDQ(0,1,1)),
arima_at1 = ARIMA(box_cox(num_casos.x,lambda_mal)),
arima_at2 = ARIMA(box_cox(num_casos.x,lambda_mal), stepwise = FALSE,approx = FALSE))
# Málaga arima_at1 <ARIMA(0,1,1)(1,0,1)[7]>
# Málaga arima_at2 <ARIMA(0,1,1)(1,0,1)[7]>
Mal_fit_model %>% pivot_longer(!sub_region_2,
names_to = "Model name",
values_to = "Orders")
## # A mable: 6 x 3
## # Key: sub_region_2, Model name [6]
## sub_region_2 `Model name` Orders
## <chr> <chr> <model>
## 1 Málaga SNaive <SNAIVE>
## 2 Málaga arima_mn1 <ARIMA(1,1,1)(1,0,1)[7]>
## 3 Málaga arima_mn2 <ARIMA(1,1,1)(1,1,1)[7]>
## 4 Málaga arima_mn3 <ARIMA(0,1,1)(0,1,1)[7]>
## 5 Málaga arima_at1 <ARIMA(0,1,1)(1,0,1)[7]>
## 6 Málaga arima_at2 <ARIMA(0,1,1)(1,0,1)[7]>
# Good model >> Less Sigma / More BIC or AIC
glance(Mal_fit_model) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 6 x 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_mn3 1.52 -427. 861. 861. 871.
## 2 arima_mn2 1.51 -426. 862. 862. 879.
## 3 arima_at1 1.47 -432. 872. 872. 886.
## 4 arima_at2 1.47 -432. 872. 872. 886.
## 5 arima_mn1 1.47 -431. 872. 873. 890.
## 6 SNaive 2991. NA NA NA NA
Mal_fit_model %>% select(SNaive) %>% gg_tsresiduals()
Mal_fit_model %>% select(arima_mn1) %>% gg_tsresiduals()
Mal_fit_model %>% select(arima_mn2) %>% gg_tsresiduals()
Mal_fit_model %>% select(arima_mn3) %>% gg_tsresiduals()
Mal_fit_model %>% select(arima_at1) %>% gg_tsresiduals()
Mal_fit_model %>% select(arima_at2) %>% gg_tsresiduals()
augment(Mal_fit_model) %>%
features(.innov, ljung_box, lag=7)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Málaga arima_at1 14.4 0.0445
## 2 Málaga arima_at2 14.4 0.0445
## 3 Málaga arima_mn1 12.4 0.0882
## 4 Málaga arima_mn2 12.0 0.0990
## 5 Málaga arima_mn3 14.9 0.0369
## 6 Málaga SNaive 521. 0
augment(Mal_fit_model) %>%
features(.innov, ljung_box, lag=14)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Málaga arima_at1 18.0 0.206
## 2 Málaga arima_at2 18.0 0.206
## 3 Málaga arima_mn1 15.6 0.338
## 4 Málaga arima_mn2 15.9 0.317
## 5 Málaga arima_mn3 20.5 0.114
## 6 Málaga SNaive 693. 0
augment(Mal_fit_model) %>%
features(.innov, ljung_box, lag=21)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Málaga arima_at1 25.3 0.233
## 2 Málaga arima_at2 25.3 0.233
## 3 Málaga arima_mn1 23.4 0.321
## 4 Málaga arima_mn2 26.4 0.191
## 5 Málaga arima_mn3 32.3 0.0550
## 6 Málaga SNaive 711. 0
# Forecast
Mal_fc_h7<-fabletools::forecast(Mal_fit_model, h=7)
Mal_fc_h14<-fabletools::forecast(Mal_fit_model, h=14)
Mal_fc_h21<-fabletools::forecast(Mal_fit_model, h=21)
# Accuracy
fabletools::accuracy(Mal_fc_h7, Mal_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Málaga Test 74.9 77.2 74.9 43.5 43.5 NaN NaN -0.0846
## 2 arima_at2 Málaga Test 74.9 77.2 74.9 43.5 43.5 NaN NaN -0.0846
## 3 arima_mn1 Málaga Test 73.5 75.8 73.5 42.6 42.6 NaN NaN -0.0765
## 4 arima_mn2 Málaga Test 73.1 75.4 73.1 42.7 42.7 NaN NaN -0.226
## 5 arima_mn3 Málaga Test 76.7 79.0 76.7 44.9 44.9 NaN NaN -0.171
## 6 SNaive Málaga Test 48.3 57.7 48.3 27.9 27.9 NaN NaN 0.412
fabletools::accuracy(Mal_fc_h14, Mal_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Málaga Test 81.4 86.1 81.4 46.4 46.4 NaN NaN 0.320
## 2 arima_at2 Málaga Test 81.4 86.1 81.4 46.4 46.4 NaN NaN 0.320
## 3 arima_mn1 Málaga Test 79.9 84.7 79.9 45.5 45.5 NaN NaN 0.321
## 4 arima_mn2 Málaga Test 77.9 81.6 77.9 44.9 44.9 NaN NaN 0.299
## 5 arima_mn3 Málaga Test 83.8 87.9 83.8 48.5 48.5 NaN NaN 0.339
## 6 SNaive Málaga Test 49 63.3 49.4 26.9 27.2 NaN NaN 0.516
fabletools::accuracy(Mal_fc_h21, Mal_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Málaga Test 117. 141. 117. 53.1 53.1 NaN NaN 0.648
## 2 arima_at2 Málaga Test 117. 141. 117. 53.1 53.1 NaN NaN 0.648
## 3 arima_mn1 Málaga Test 116. 140. 116. 52.3 52.3 NaN NaN 0.648
## 4 arima_mn2 Málaga Test 112. 135. 112. 51.1 51.1 NaN NaN 0.655
## 5 arima_mn3 Málaga Test 121. 144. 121. 55.3 55.3 NaN NaN 0.664
## 6 SNaive Málaga Test 80.8 118. 83.4 33.0 34.6 NaN NaN 0.637
# Plots
Mal_fc_h7 %>%
autoplot(Mal_N_cases_tt) +
labs(title="Málaga - forecast h7")
Mal_fc_h14 %>%
autoplot(Mal_N_cases_tt) +
labs(title="Málaga - forecast h14")
Mal_fc_h21 %>%
autoplot(Mal_N_cases_tt) +
labs(title="Málaga - forecast h21")
####### Cádiz #######
# Model train
Cad_fit_model <- Cad_N_cases_tr %>%
model(
SNaive = SNAIVE(num_casos.x),
arima_mn1 = ARIMA(box_cox(num_casos.x,lambda_cad) ~ pdq(1,1,1) + PDQ(1,1,1)),
arima_mn2 = ARIMA(box_cox(num_casos.x,lambda_cad) ~ pdq(1,1,2) + PDQ(1,1,1)),
arima_mn3 = ARIMA(box_cox(num_casos.x,lambda_cad) ~ pdq(2,1,1) + PDQ(1,1,1)),
arima_at1 = ARIMA(box_cox(num_casos.x,lambda_cad)),
arima_at2 = ARIMA(box_cox(num_casos.x,lambda_cad), stepwise = FALSE,approx = FALSE))
# Cádiz arima_at1 <ARIMA(0,1,1)(1,0,1)[7]>
# Cádiz arima_at2 <ARIMA(4,1,0)(1,0,1)[7]>
Cad_fit_model %>% pivot_longer(!sub_region_2,
names_to = "Model name",
values_to = "Orders")
## # A mable: 6 x 3
## # Key: sub_region_2, Model name [6]
## sub_region_2 `Model name` Orders
## <chr> <chr> <model>
## 1 Cádiz SNaive <SNAIVE>
## 2 Cádiz arima_mn1 <ARIMA(1,1,1)(1,1,1)[7]>
## 3 Cádiz arima_mn2 <ARIMA(1,1,2)(1,1,1)[7]>
## 4 Cádiz arima_mn3 <ARIMA(2,1,1)(1,1,1)[7]>
## 5 Cádiz arima_at1 <ARIMA(0,1,1)(1,0,1)[7]>
## 6 Cádiz arima_at2 <ARIMA(4,1,0)(1,0,1)[7]>
# Good model >> Less Sigma / AICc
glance(Cad_fit_model) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 6 x 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_mn1 2.25 -479. 968. 968. 985.
## 2 arima_mn2 2.26 -478. 969. 969. 990.
## 3 arima_mn3 2.26 -479. 969. 970. 991.
## 4 arima_at2 2.13 -480. 975. 975. 1000.
## 5 arima_at1 2.19 -485. 978. 978. 993.
## 6 SNaive 2562. NA NA NA NA
Cad_fit_model %>% select(SNaive) %>% gg_tsresiduals()
Cad_fit_model %>% select(arima_mn1) %>% gg_tsresiduals()
Cad_fit_model %>% select(arima_mn2) %>% gg_tsresiduals()
Cad_fit_model %>% select(arima_mn3) %>% gg_tsresiduals()
Cad_fit_model %>% select(arima_at1) %>% gg_tsresiduals()
Cad_fit_model %>% select(arima_at2) %>% gg_tsresiduals()
augment(Cad_fit_model) %>%
features(.innov, ljung_box, lag=7)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Cádiz arima_at1 15.8 0.0270
## 2 Cádiz arima_at2 6.55 0.478
## 3 Cádiz arima_mn1 16.5 0.0209
## 4 Cádiz arima_mn2 15.1 0.0347
## 5 Cádiz arima_mn3 16.0 0.0249
## 6 Cádiz SNaive 710. 0
augment(Cad_fit_model) %>%
features(.innov, ljung_box, lag=14)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Cádiz arima_at1 28.8 0.0112
## 2 Cádiz arima_at2 23.6 0.0512
## 3 Cádiz arima_mn1 30.4 0.00665
## 4 Cádiz arima_mn2 29.1 0.0103
## 5 Cádiz arima_mn3 30.1 0.00741
## 6 Cádiz SNaive 899. 0
augment(Cad_fit_model) %>%
features(.innov, ljung_box, lag=21)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Cádiz arima_at1 38.9 0.0101
## 2 Cádiz arima_at2 33.7 0.0393
## 3 Cádiz arima_mn1 41.2 0.00527
## 4 Cádiz arima_mn2 39.4 0.00881
## 5 Cádiz arima_mn3 41.0 0.00556
## 6 Cádiz SNaive 911. 0
# Forecast
Cad_fc_h7<-fabletools::forecast(Cad_fit_model, h=7)
Cad_fc_h14<-fabletools::forecast(Cad_fit_model, h=14)
Cad_fc_h21<-fabletools::forecast(Cad_fit_model, h=21)
# Accuracy
fabletools::accuracy(Cad_fc_h7, Cad_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_a~ Cádiz Test 28.6 31.8 28.6 16.9 16.9 NaN NaN 0.155
## 2 arima_a~ Cádiz Test 30.2 32.8 30.2 18.1 18.1 NaN NaN 0.213
## 3 arima_m~ Cádiz Test 22.4 26.2 22.9 13.5 13.9 NaN NaN -0.00680
## 4 arima_m~ Cádiz Test 22.9 26.8 23.2 13.8 14.0 NaN NaN -0.0266
## 5 arima_m~ Cádiz Test 23.2 26.8 23.4 14.1 14.2 NaN NaN -0.0268
## 6 SNaive Cádiz Test 3.57 20.6 17.6 2.68 10.8 NaN NaN -0.166
fabletools::accuracy(Cad_fc_h14, Cad_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Cádiz Test 41.8 47.9 41.8 23.8 23.8 NaN NaN 0.462
## 2 arima_at2 Cádiz Test 42.6 48.2 42.6 24.4 24.4 NaN NaN 0.459
## 3 arima_mn1 Cádiz Test 31.4 37.3 31.6 18.0 18.2 NaN NaN 0.433
## 4 arima_mn2 Cádiz Test 32.0 37.9 32.1 18.4 18.5 NaN NaN 0.425
## 5 arima_mn3 Cádiz Test 32.3 38.0 32.4 18.6 18.7 NaN NaN 0.432
## 6 SNaive Cádiz Test 10.1 34.3 28.8 5.48 16.5 NaN NaN 0.583
fabletools::accuracy(Cad_fc_h21, Cad_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Cádiz Test 78.5 104. 78.5 34.2 34.2 NaN NaN 0.817
## 2 arima_at2 Cádiz Test 78.6 104. 78.6 34.4 34.4 NaN NaN 0.815
## 3 arima_mn1 Cádiz Test 64.3 91.6 64.4 27.3 27.4 NaN NaN 0.798
## 4 arima_mn2 Cádiz Test 65.0 92.2 65.1 27.6 27.7 NaN NaN 0.797
## 5 arima_mn3 Cádiz Test 65.1 92.1 65.2 27.8 27.8 NaN NaN 0.797
## 6 SNaive Cádiz Test 41.4 86.5 58.1 14.6 24.4 NaN NaN 0.758
# Plots
Cad_fc_h7 %>%
autoplot(Cad_N_cases_tt) +
labs(title="Cádiz - forecast h7")
Cad_fc_h14 %>%
autoplot(Cad_N_cases_tt) +
labs(title="Cádiz - forecast h14")
Cad_fc_h21 %>%
autoplot(Cad_N_cases_tt) +
labs(title="Cádiz - forecast h21")
####### Sevilla #######
# Model train
Sev_fit_model <- Sev_N_cases_tr %>%
model(
SNaive = SNAIVE(num_casos.x),
arima_mn1 = ARIMA(box_cox(num_casos.x,lambda_sev) ~ pdq(1,1,1) + PDQ(1,1,1)),
arima_mn2 = ARIMA(box_cox(num_casos.x,lambda_sev) ~ pdq(1,1,2) + PDQ(1,0,1)),
arima_mn3 = ARIMA(box_cox(num_casos.x,lambda_sev) ~ pdq(1,1,1) + PDQ(1,0,1)),
arima_at1 = ARIMA(box_cox(num_casos.x,lambda_sev)),
arima_at2 = ARIMA(box_cox(num_casos.x,lambda_sev), stepwise = FALSE,approx = FALSE))
# Sevilla arima_at1 <ARIMA(1,1,1)(1,0,1)[7]>
# Sevilla arima_at2 <ARIMA(0,1,1)(1,0,1)[7]>
Sev_fit_model %>% pivot_longer(!sub_region_2,
names_to = "Model name",
values_to = "Orders")
## # A mable: 6 x 3
## # Key: sub_region_2, Model name [6]
## sub_region_2 `Model name` Orders
## <chr> <chr> <model>
## 1 Sevilla SNaive <SNAIVE>
## 2 Sevilla arima_mn1 <ARIMA(1,1,1)(1,1,1)[7]>
## 3 Sevilla arima_mn2 <ARIMA(1,1,2)(1,0,1)[7]>
## 4 Sevilla arima_mn3 <ARIMA(1,1,1)(1,0,1)[7]>
## 5 Sevilla arima_at1 <ARIMA(1,1,1)(1,0,1)[7]>
## 6 Sevilla arima_at2 <ARIMA(0,1,1)(1,0,1)[7]>
# Good model >> Less Sigma / AICc
glance(Sev_fit_model) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 6 x 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_mn1 2.16 -472. 954. 954. 972.
## 2 arima_at2 2.08 -479. 966. 966. 980.
## 3 arima_mn3 2.08 -478. 967. 967. 985.
## 4 arima_at1 2.08 -478. 967. 967. 985.
## 5 arima_mn2 2.07 -478. 967. 967. 989.
## 6 SNaive 14589. NA NA NA NA
Sev_fit_model %>% select(SNaive) %>% gg_tsresiduals()
Sev_fit_model %>% select(arima_mn1) %>% gg_tsresiduals()
Sev_fit_model %>% select(arima_mn2) %>% gg_tsresiduals()
Sev_fit_model %>% select(arima_mn3) %>% gg_tsresiduals()
Sev_fit_model %>% select(arima_at1) %>% gg_tsresiduals()
Sev_fit_model %>% select(arima_at2) %>% gg_tsresiduals()
augment(Sev_fit_model) %>%
features(.innov, ljung_box, lag=7, dof=3)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Sevilla arima_at1 2.65 0.617
## 2 Sevilla arima_at2 3.93 0.416
## 3 Sevilla arima_mn1 2.12 0.715
## 4 Sevilla arima_mn2 0.538 0.970
## 5 Sevilla arima_mn3 2.65 0.617
## 6 Sevilla SNaive 591. 0
augment(Sev_fit_model) %>%
features(.innov, ljung_box, lag=14, dof=3)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Sevilla arima_at1 6.87 0.809
## 2 Sevilla arima_at2 8.14 0.701
## 3 Sevilla arima_mn1 8.00 0.714
## 4 Sevilla arima_mn2 5.68 0.894
## 5 Sevilla arima_mn3 6.87 0.809
## 6 Sevilla SNaive 743. 0
augment(Sev_fit_model) %>%
features(.innov, ljung_box, lag=21, dof=3)
## # A tibble: 6 x 4
## sub_region_2 .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Sevilla arima_at1 15.1 0.652
## 2 Sevilla arima_at2 16.0 0.594
## 3 Sevilla arima_mn1 15.9 0.597
## 4 Sevilla arima_mn2 13.9 0.736
## 5 Sevilla arima_mn3 15.1 0.652
## 6 Sevilla SNaive 815. 0
# Forecast
Sev_fc_h7<-fabletools::forecast(Sev_fit_model, h=7)
Sev_fc_h14<-fabletools::forecast(Sev_fit_model, h=14)
Sev_fc_h21<-fabletools::forecast(Sev_fit_model, h=21)
# Accuracy
fabletools::accuracy(Sev_fc_h7, Sev_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Sevilla Test 61.7 65.2 61.7 37.9 37.9 NaN NaN -0.297
## 2 arima_at2 Sevilla Test 60.4 64.1 60.4 37.1 37.1 NaN NaN -0.305
## 3 arima_mn1 Sevilla Test 65.5 68.5 65.5 41.6 41.6 NaN NaN -0.404
## 4 arima_mn2 Sevilla Test 62.2 65.8 62.2 38.2 38.2 NaN NaN -0.333
## 5 arima_mn3 Sevilla Test 61.7 65.2 61.7 37.9 37.9 NaN NaN -0.297
## 6 SNaive Sevilla Test 35.1 48.5 39.1 21.1 23.8 NaN NaN -0.360
fabletools::accuracy(Sev_fc_h14, Sev_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Sevilla Test 60.7 67.9 60.7 38.6 38.6 NaN NaN 0.357
## 2 arima_at2 Sevilla Test 60.0 67.4 60.0 38.1 38.1 NaN NaN 0.352
## 3 arima_mn1 Sevilla Test 66.8 73.0 66.8 44.3 44.3 NaN NaN 0.362
## 4 arima_mn2 Sevilla Test 61.4 68.5 61.4 39.0 39.0 NaN NaN 0.346
## 5 arima_mn3 Sevilla Test 60.7 67.9 60.7 38.6 38.6 NaN NaN 0.357
## 6 SNaive Sevilla Test 25.8 50.2 38.6 13.5 24.1 NaN NaN 0.299
fabletools::accuracy(Sev_fc_h21, Sev_N_cases_tt)
## # A tibble: 6 x 11
## .model sub_region_2 .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima_at1 Sevilla Test 80.9 94.9 80.9 45.7 45.7 NaN NaN 0.578
## 2 arima_at2 Sevilla Test 80.6 94.9 80.6 45.4 45.4 NaN NaN 0.580
## 3 arima_mn1 Sevilla Test 89.1 103. 89.1 51.8 51.8 NaN NaN 0.600
## 4 arima_mn2 Sevilla Test 81.7 95.7 81.7 46.1 46.1 NaN NaN 0.576
## 5 arima_mn3 Sevilla Test 80.9 94.9 80.9 45.7 45.7 NaN NaN 0.578
## 6 SNaive Sevilla Test 40.0 69.2 52.5 18.5 29.0 NaN NaN 0.547
# Plots
Sev_fc_h7 %>%
autoplot(Sev_N_cases_tt) +
labs(title="Sevilla - forecast h7")
Sev_fc_h14 %>%
autoplot(Sev_N_cases_tt) +
labs(title="Sevilla - forecast h14")
Sev_fc_h21 %>%
autoplot(Sev_N_cases_tt) +
labs(title="Sevilla - forecast h21")
Baayen, R Harald. 2008. Analyzing Linguistic Data: A Practical Introduction to Statistics Using R. Cambridge University Press.
Hothorn, Torsten, and Brian S Everitt. 2014. A Handbook of Statistical Analyses Using R. CRC press.
Hyndman, Rob J., and George Athanasopoulos. 2021. “Forecasting: Principles and Practice, 3rd.” OTexts: Melbourne, Australia. OTexts.com.
Liviano Solas, Daniel, and Maria Pujol Jover. n.d. Analisis de Datos Y Estadastica Descriptiva Con R Y R-Commander. UOC.
Teetor, Paul. 2011. R Cookbook: Proven Recipes for Data Analysis, Statistics, and Graphics. O’Reilly Media, Inc.
Vegas Lozano, Esteban. n.d. Preprocesamiento de Los Datos. UOC.